From c25c09b0688bcda1da148fbcac47adab697c409c Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Thu, 3 Jun 2021 15:02:26 +0530
Subject: [PATCH 01/26] ENH: r, raw, economic now accept stacked matrices

---
 numpy/linalg/linalg.py          |  31 +--
 numpy/linalg/umath_linalg.c.src | 326 ++++++++++++++++++++++++++++++++
 2 files changed, 335 insertions(+), 22 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 46fb2502e5cc..087156bd0aa6 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -904,34 +904,21 @@ def qr(a, mode='reduced'):
             raise ValueError(f"Unrecognized mode '{mode}'")
 
     a, wrap = _makearray(a)
-    _assert_2d(a)
-    m, n = a.shape
+    _assert_stacked_2d(a)
+    m, n = a.shape[-2:]
     t, result_t = _commonType(a)
-    a = _fastCopyAndTranspose(t, a)
+    a = a.astype(t, copy=True)
     a = _to_native_byte_order(a)
     mn = min(m, n)
-    tau = zeros((mn,), t)
 
-    if isComplexType(t):
-        lapack_routine = lapack_lite.zgeqrf
-        routine_name = 'zgeqrf'
+    if m <= n:
+        gufunc = _umath_linalg.qr_r_raw_e_m
     else:
-        lapack_routine = lapack_lite.dgeqrf
-        routine_name = 'dgeqrf'
+        gufunc = _umath_linalg.qr_r_raw_e_n
 
-    # calculate optimal size of work data 'work'
-    lwork = 1
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, n, a, max(1, m), tau, work, -1, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-    # do qr decomposition
-    lwork = max(1, n, int(abs(work[0])))
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, n, a, max(1, m), tau, work, lwork, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
+    signature = 'D->D' if isComplexType(t) else 'd->d'
+    extobj = get_linalg_error_extobj(_raise_linalgerror_lstsq)
+    tau = gufunc(a, signature=signature, extobj=extobj)
 
     # handle modes that don't return q
     if mode == 'r':
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 1807aadcf584..a2cc86dc47b8 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -162,6 +162,23 @@ FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               double rwork[], fortran_int iwork[],
               fortran_int *info);
 
+extern fortran_int 
+FNAME(sgeqrf)(fortran_int *m, fortran_int *n, float a[], fortran_int *lda,
+              float tau[], float work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int 
+FNAME(cgeqrf)(fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int *lda,
+              f2c_complex tau[], f2c_complex work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int 
+FNAME(dgeqrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda,
+              double tau[], double work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int 
+FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda,
+              f2c_doublecomplex tau[], f2c_doublecomplex work[],
+              fortran_int *lwork, fortran_int *info);
+
 extern fortran_int
 FNAME(sgesv)(fortran_int *n, fortran_int *nrhs,
              float a[], fortran_int *lda,
@@ -3235,6 +3252,289 @@ static void
 
 /**end repeat**/
 
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - r, raw, economic) */
+
+typedef struct geqfr_params_struct
+{
+    fortran_int M;
+    fortran_int N;
+    void *A;
+    fortran_int LDA;
+    void* TAU;
+    void *WORK;
+    fortran_int LWORK;
+} GEQRF_PARAMS_t;
+
+
+static inline void
+dump_geqrf_params(const char *name,
+                  GEQRF_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"
+              "%14s: %18d\n",
+
+              name,
+
+              "A", params->A,
+              "TAU", params->TAU,
+              "WORK", params->WORK,
+
+              "M", (int)params->M,
+              "N", (int)params->N,
+              "LDA", (int)params->LDA,
+              "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GEQRF_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N,
+                          params->A, &params->LDA,
+                          params->TAU,
+                          params->WORK, &params->LWORK,
+                          &rv);
+    return rv;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sgeqrf,dgeqrf#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@(GEQRF_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(a_size + tau_size);
+
+    if (!mem_buff)
+        goto error;
+
+    a = mem_buff;
+    tau = a + a_size;
+    memset(tau, 0, tau_size);
+
+
+    params->M = m;
+    params->N = n;
+    params->A = a;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        /* IS THE FOLLOWING METHOD CORRECT? */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query;
+
+        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+    }
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    memset(work, 0, work_size);
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=CFLOAT,CDOUBLE#
+   #lapack_func=cgeqrf,zgeqrf#
+   #ftyp=fortran_complex,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@(GEQRF_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(a_size + tau_size);
+
+    if (!mem_buff)
+        goto error;
+
+    a = mem_buff;
+    tau = a + tau_size;
+    memset(tau, 0, tau_size);
+
+
+    params->M = m;
+    params->N = n;
+    params->A = a;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        /* IS THE FOLLOWING METHOD CORRECT? */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query.r;
+
+        work_size  = (size_t) work_size_query.r * sizeof(@ftyp@);
+    }
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+ */
+static inline void
+release_@lapack_func@(GEQRF_PARAMS_t* params)
+{
+    /* A and WORK contain allocated blocks */
+    free(params->A);
+    free(params->WORK);
+    memset(params, 0, sizeof(*params));
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #dot_func=sdot,ddot,cdotc,zdotc#
+   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float,  npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
+ */
+static void
+@TYPE@_qr_r_raw_e(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
+{
+    GEQRF_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m;
+
+    INIT_OUTER_LOOP_2
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+
+    if (init_@lapack_func@(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, a_out, tau_out;
+
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+        init_linearize_data(&a_out, m, n, steps[0], steps[1]);
+        init_linearize_data(&tau_out, 1, fortran_int_min(m, n), 1, steps[2]);
+
+        BEGIN_OUTER_LOOP_2
+            int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[0], params.A, &a_out);
+                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[0], &a_in);
+                nan_@TYPE@_matrix(args[1], &tau_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
 #pragma GCC diagnostic pop
 
 /* -------------------------------------------------------------------------- */
@@ -3306,6 +3606,7 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_r_raw_e);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
 
@@ -3379,6 +3680,13 @@ static char lstsq_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
 };
 
+static char qr_r_raw_e_types[] = {
+    NPY_FLOAT,   NPY_FLOAT,
+    NPY_DOUBLE,  NPY_DOUBLE,
+    NPY_CFLOAT,  NPY_CFLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
 typedef struct gufunc_descriptor_struct {
     char *name;
     char *signature;
@@ -3587,6 +3895,24 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         4, 3, 4,
         FUNC_ARRAY_NAME(lstsq),
         lstsq_types
+    },
+    {
+        "qr_r_raw_e_m",
+        "(m,n)->(m)",
+        "Compute TAU vector for the last two dimensions \n"\
+        "and broadcast to the rest. For m <= n. \n",
+        4, 1, 1,
+        FUNC_ARRAY_NAME(qr_r_raw_e),
+        qr_r_raw_e_types
+    },
+    {
+        "qr_r_raw_e_n",
+        "(m,n)->(n)",
+        "Compute TAU vector for the last two dimensions \n"\
+        "and broadcast to the rest. For m > n. \n",
+        4, 1, 1,
+        FUNC_ARRAY_NAME(qr_r_raw_e),
+        qr_r_raw_e_types
     }
 };
 

From d5cbb5e0eefec9c97a901f1efa3e1a40d1ec0b04 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Fri, 4 Jun 2021 11:59:36 +0530
Subject: [PATCH 02/26] WIP

---
 numpy/linalg/linalg.py          | 100 +++++----
 numpy/linalg/umath_linalg.c.src | 365 ++++++++++++++++++++++++++++++--
 2 files changed, 407 insertions(+), 58 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 087156bd0aa6..97d130894182 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -99,6 +99,9 @@ def _raise_linalgerror_svd_nonconvergence(err, flag):
 def _raise_linalgerror_lstsq(err, flag):
     raise LinAlgError("SVD did not converge in Linear Least Squares")
 
+def _raise_linalgerror_qr_r_raw(err, flag):
+    raise LinAlgError("Illegal argument found while performing QR factorization")
+
 def get_linalg_error_extobj(callback):
     extobj = list(_linalg_error_extobj)  # make a copy
     extobj[2] = callback
@@ -766,12 +769,12 @@ def cholesky(a):
 
 # QR decomposition
 
-def _qr_dispatcher(a, mode=None):
+def _qr_dispatcher(a, mode=None, old=None):
     return (a,)
 
 
 @array_function_dispatch(_qr_dispatcher)
-def qr(a, mode='reduced'):
+def qr(a, mode='reduced', old=True):
     """
     Compute the qr factorization of a matrix.
 
@@ -912,62 +915,75 @@ def qr(a, mode='reduced'):
     mn = min(m, n)
 
     if m <= n:
-        gufunc = _umath_linalg.qr_r_raw_e_m
+        gufunc = _umath_linalg.qr_r_raw_m
     else:
-        gufunc = _umath_linalg.qr_r_raw_e_n
+        gufunc = _umath_linalg.qr_r_raw_n
 
     signature = 'D->D' if isComplexType(t) else 'd->d'
-    extobj = get_linalg_error_extobj(_raise_linalgerror_lstsq)
+    extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
     tau = gufunc(a, signature=signature, extobj=extobj)
 
     # handle modes that don't return q
     if mode == 'r':
-        r = _fastCopyAndTranspose(result_t, a[:, :mn])
-        return wrap(triu(r))
+        return wrap(triu(a.T))
 
     if mode == 'raw':
-        return a, tau
+        return wrap(a.T), tau
 
     if mode == 'economic':
         if t != result_t :
             a = a.astype(result_t, copy=False)
-        return wrap(a.T)
-
-    #  generate q from a
-    if mode == 'complete' and m > n:
-        mc = m
-        q = empty((m, m), t)
-    else:
-        mc = mn
-        q = empty((n, m), t)
-    q[:n] = a
-
-    if isComplexType(t):
-        lapack_routine = lapack_lite.zungqr
-        routine_name = 'zungqr'
-    else:
-        lapack_routine = lapack_lite.dorgqr
-        routine_name = 'dorgqr'
+        return wrap(a)
 
-    # determine optimal lwork
-    lwork = 1
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, -1, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-    # compute q
-    lwork = max(1, n, int(abs(work[0])))
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, lwork, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-    q = _fastCopyAndTranspose(result_t, q[:mc])
-    r = _fastCopyAndTranspose(result_t, a[:, :mc])
+    if old:
+        #  generate q from a
+        if mode == 'complete' and m > n:
+            mc = m
+            q = empty((m, m), t)
+        else:
+            mc = mn
+            q = empty((m, n), t)
+        q[:n] = a
 
-    return wrap(q), wrap(triu(r))
+        if isComplexType(t):
+            lapack_routine = lapack_lite.zungqr
+            routine_name = 'zungqr'
+        else:
+            lapack_routine = lapack_lite.dorgqr
+            routine_name = 'dorgqr'
+
+        # determine optimal lwork
+        lwork = 1
+        work = zeros((lwork,), t)
+        results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, -1, 0)
+        if results['info'] != 0:
+            raise LinAlgError('%s returns %d' % (routine_name, results['info']))
+
+        # compute q
+        lwork = max(1, n, int(abs(work[0])))
+        work = zeros((lwork,), t)
+        results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, lwork, 0)
+        if results['info'] != 0:
+            raise LinAlgError('%s returns %d' % (routine_name, results['info']))
+
+        q = _fastCopyAndTranspose(result_t, q[:mc])
+        r = _fastCopyAndTranspose(result_t, a[:, :mc])
+
+        return wrap(q), wrap(triu(r))
+
+    if mode == 'reduced':
+        if m <= n:
+            gufunc = _umath_linalg.qr_reduced_m
+        else:
+            gufunc = _umath_linalg.qr_reduced_n
+    
+    print(gufunc)
+    signature = 'DD->D' if isComplexType(t) else 'dd->d'
+    extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
+    q = gufunc(a, tau, signature=signature, extobj=extobj)
 
+    r = _fastCopyAndTranspose(result_t, a[:, :mn])
+    return q, wrap(triu(r))
 
 # Eigenvalues
 
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index a2cc86dc47b8..045b70003118 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -179,6 +179,23 @@ FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int
               f2c_doublecomplex tau[], f2c_doublecomplex work[],
               fortran_int *lwork, fortran_int *info);
 
+extern fortran_int 
+FNAME(sorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, float a[], fortran_int *lda,
+              float tau[], float work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int 
+FNAME(dorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, double a[], fortran_int *lda,
+              double tau[], double work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int 
+FNAME(cungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_complex a[],
+              fortran_int *lda, f2c_complex tau[],
+              f2c_complex work[], fortran_int *lwork, fortran_int *info);
+extern fortran_int 
+FNAME(zungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex a[],
+              fortran_int *lda, f2c_doublecomplex tau[],
+              f2c_doublecomplex work[], fortran_int *lwork, fortran_int *info);
+
 extern fortran_int
 FNAME(sgesv)(fortran_int *n, fortran_int *nrhs,
              float a[], fortran_int *lda,
@@ -3253,7 +3270,7 @@ static void
 /**end repeat**/
 
 /* -------------------------------------------------------------------------- */
-                 /* qr (modes - r, raw, economic) */
+                 /* qr (modes - r, raw) */
 
 typedef struct geqfr_params_struct
 {
@@ -3278,7 +3295,7 @@ dump_geqrf_params(const char *name,
               "%14s: %18p\n"\
               "%14s: %18d\n"\
               "%14s: %18d\n"\
-              "%14s: %18d\n"
+              "%14s: %18d\n"\
               "%14s: %18d\n",
 
               name,
@@ -3421,7 +3438,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         goto error;
 
     a = mem_buff;
-    tau = a + tau_size;
+    tau = a + a_size;
     memset(tau, 0, tau_size);
 
 
@@ -3452,6 +3469,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         goto error;
 
     work = mem_buff2;
+    memset(work, 0, work_size);
 
     params->WORK = work;
     params->LWORK = work_count;
@@ -3486,7 +3504,6 @@ release_@lapack_func@(GEQRF_PARAMS_t* params)
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
    #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
-   #dot_func=sdot,ddot,cdotc,zdotc#
    #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
    #basetyp = npy_float, npy_double, npy_float,  npy_double#
    #ftyp = fortran_real, fortran_doublereal,
@@ -3494,7 +3511,7 @@ release_@lapack_func@(GEQRF_PARAMS_t* params)
    #cmplx = 0, 0, 1, 1#
  */
 static void
-@TYPE@_qr_r_raw_e(char **args, npy_intp const *dimensions, npy_intp const *steps,
+@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
           void *NPY_UNUSED(func))
 {
     GEQRF_PARAMS_t params;
@@ -3507,10 +3524,9 @@ static void
     n = (fortran_int)dimensions[1];
 
     if (init_@lapack_func@(&params, m, n)) {
-        LINEARIZE_DATA_t a_in, a_out, tau_out;
+        LINEARIZE_DATA_t a_in, tau_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
-        init_linearize_data(&a_out, m, n, steps[0], steps[1]);
         init_linearize_data(&tau_out, 1, fortran_int_min(m, n), 1, steps[2]);
 
         BEGIN_OUTER_LOOP_2
@@ -3518,7 +3534,7 @@ static void
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[0], params.A, &a_out);
+                delinearize_@TYPE@_matrix(args[0], params.A, &a_in);
                 delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
             } else {
                 error_occurred = 1;
@@ -3535,6 +3551,297 @@ static void
 
 /**end repeat**/
 
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - reduced) */
+
+typedef struct gqr_params_struct
+{
+    fortran_int M;
+    fortran_int MC;
+    fortran_int MN;
+    void *Q;
+    fortran_int LDA;
+    void* TAU;
+    void *WORK;
+    fortran_int LWORK;
+} GQR_PARAMS_t;
+
+
+static inline void
+dump_gqr_params(const char *name,
+                GQR_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n",
+
+              name,
+
+              "Q", params->Q,
+              "TAU", params->TAU,
+              "WORK", params->WORK,
+
+              "M", (int)params->M,
+              "MC", (int)params->MC,
+              "MN", (int)params->MN,
+              "LDA", (int)params->LDA,
+              "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GQR_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->MC, &params->MN,
+                          params->Q, &params->LDA,
+                          params->TAU,
+                          params->WORK, &params->LWORK,
+                          &rv);
+    return rv;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sorgqr,dorgqr#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@(GQR_PARAMS_t *params,
+                   void* a,
+                   fortran_int m,
+                   fortran_int n)
+{
+    printf("%zu %zu\n", m, n);
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *q, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(q_size + tau_size);
+
+    if (!mem_buff)
+        goto error;
+
+    q = mem_buff;
+    memcpy(q, a, q_size);
+    tau = q + q_size;
+
+
+    params->M = m;
+    params->MC = min_m_n;
+    params->MN = min_m_n;
+    params->Q = q;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        /* IS THE FOLLOWING METHOD CORRECT? */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query;
+
+        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+    }
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    memset(work, 0, work_size);
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=CFLOAT,CDOUBLE#
+   #lapack_func=cungqr,zungqr#
+   #ftyp=fortran_complex,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@(GQR_PARAMS_t *params,
+                   void* a,
+                   fortran_int m,
+                   fortran_int n)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *q, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(q_size + tau_size);
+
+    if (!mem_buff)
+        goto error;
+
+    q = mem_buff;
+    memcpy(q, a, q_size);
+    tau = q + q_size;
+
+
+    params->M = m;
+    params->MC = min_m_n;
+    params->MN = min_m_n;
+    params->Q = q;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        /* IS THE FOLLOWING METHOD CORRECT? */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query.r;
+
+        work_size  = (size_t) work_size_query.r * sizeof(@ftyp@);
+    }
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    memset(work, 0, work_size);
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+ */
+static inline void
+release_@lapack_func@(GQR_PARAMS_t* params)
+{
+    /* A and WORK contain allocated blocks */
+    free(params->Q);
+    free(params->WORK);
+    memset(params, 0, sizeof(*params));
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float,  npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
+ */
+static void
+@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    GQR_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m;
+
+    INIT_OUTER_LOOP_3
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+
+
+    if (init_@lapack_func@(&params, (void*)args[0], m, n)) {
+        LINEARIZE_DATA_t tau_in, q_out;
+
+        init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+        init_linearize_data(&q_out, fortran_int_min(m, n), m, steps[4], steps[3]);
+
+        BEGIN_OUTER_LOOP_3
+            int not_ok;
+            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_in);
+                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[1], &tau_in);
+                nan_@TYPE@_matrix(args[2], &q_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
 #pragma GCC diagnostic pop
 
 /* -------------------------------------------------------------------------- */
@@ -3606,7 +3913,8 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_r_raw_e);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_r_raw);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_reduced);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
 
@@ -3680,13 +3988,20 @@ static char lstsq_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
 };
 
-static char qr_r_raw_e_types[] = {
+static char qr_r_raw_types[] = {
     NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,
     NPY_CFLOAT,  NPY_CFLOAT,
     NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
+static char qr_reduced_types[] = {
+    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
+    NPY_CFLOAT,  NPY_CFLOAT,  NPY_CFLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
 typedef struct gufunc_descriptor_struct {
     char *name;
     char *signature;
@@ -3897,22 +4212,40 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         lstsq_types
     },
     {
-        "qr_r_raw_e_m",
+        "qr_r_raw_m",
         "(m,n)->(m)",
         "Compute TAU vector for the last two dimensions \n"\
         "and broadcast to the rest. For m <= n. \n",
         4, 1, 1,
-        FUNC_ARRAY_NAME(qr_r_raw_e),
-        qr_r_raw_e_types
+        FUNC_ARRAY_NAME(qr_r_raw),
+        qr_r_raw_types
     },
     {
-        "qr_r_raw_e_n",
+        "qr_r_raw_n",
         "(m,n)->(n)",
         "Compute TAU vector for the last two dimensions \n"\
         "and broadcast to the rest. For m > n. \n",
         4, 1, 1,
-        FUNC_ARRAY_NAME(qr_r_raw_e),
-        qr_r_raw_e_types
+        FUNC_ARRAY_NAME(qr_r_raw),
+        qr_r_raw_types
+    },
+    {
+        "qr_reduced_m",
+        "(m,n),(m)->(m,m)",
+        "Compute Q matrix for the last two dimensions \n"\
+        "and broadcast to the rest. For m <= n. \n",
+        4, 2, 1,
+        FUNC_ARRAY_NAME(qr_reduced),
+        qr_reduced_types
+    },
+    {
+        "qr_reduced_n",
+        "(m,n),(n)->(m,n)",
+        "Compute Q matrix for the last two dimensions \n"\
+        "and broadcast to the rest. For m > n. \n",
+        4, 2, 1,
+        FUNC_ARRAY_NAME(qr_reduced),
+        qr_reduced_types
     }
 };
 

From a3b58bae0f5bb19b294669a6540e079885529fd6 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Fri, 4 Jun 2021 15:05:30 +0530
Subject: [PATCH 03/26] ready for formal addition of tests and documentation
 update

---
 numpy/linalg/linalg.py          |  56 ++-----
 numpy/linalg/umath_linalg.c.src | 279 ++++++++++++++++++++++++++++++--
 2 files changed, 280 insertions(+), 55 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 97d130894182..2031014ac7fb 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -769,12 +769,12 @@ def cholesky(a):
 
 # QR decomposition
 
-def _qr_dispatcher(a, mode=None, old=None):
+def _qr_dispatcher(a, mode=None):
     return (a,)
 
 
 @array_function_dispatch(_qr_dispatcher)
-def qr(a, mode='reduced', old=True):
+def qr(a, mode='reduced'):
     """
     Compute the qr factorization of a matrix.
 
@@ -925,65 +925,33 @@ def qr(a, mode='reduced', old=True):
 
     # handle modes that don't return q
     if mode == 'r':
-        return wrap(triu(a.T))
+        return wrap(triu(a))
 
     if mode == 'raw':
-        return wrap(a.T), tau
+        return wrap(transpose(a)), tau
 
     if mode == 'economic':
         if t != result_t :
             a = a.astype(result_t, copy=False)
         return wrap(a)
 
-    if old:
-        #  generate q from a
-        if mode == 'complete' and m > n:
-            mc = m
-            q = empty((m, m), t)
-        else:
-            mc = mn
-            q = empty((m, n), t)
-        q[:n] = a
-
-        if isComplexType(t):
-            lapack_routine = lapack_lite.zungqr
-            routine_name = 'zungqr'
+    if mode == 'complete' and  m > n:
+        mc = m
+        if m <= n:
+            gufunc = _umath_linalg.qr_complete_m
         else:
-            lapack_routine = lapack_lite.dorgqr
-            routine_name = 'dorgqr'
-
-        # determine optimal lwork
-        lwork = 1
-        work = zeros((lwork,), t)
-        results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, -1, 0)
-        if results['info'] != 0:
-            raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-        # compute q
-        lwork = max(1, n, int(abs(work[0])))
-        work = zeros((lwork,), t)
-        results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, lwork, 0)
-        if results['info'] != 0:
-            raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-        q = _fastCopyAndTranspose(result_t, q[:mc])
-        r = _fastCopyAndTranspose(result_t, a[:, :mc])
-
-        return wrap(q), wrap(triu(r))
-
-    if mode == 'reduced':
+            gufunc = _umath_linalg.qr_complete_n
+    else:
+        mc = mn
         if m <= n:
             gufunc = _umath_linalg.qr_reduced_m
         else:
             gufunc = _umath_linalg.qr_reduced_n
     
-    print(gufunc)
     signature = 'DD->D' if isComplexType(t) else 'dd->d'
     extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
     q = gufunc(a, tau, signature=signature, extobj=extobj)
-
-    r = _fastCopyAndTranspose(result_t, a[:, :mn])
-    return q, wrap(triu(r))
+    return wrap(q), wrap(triu(a[:, :mc]))
 
 # Eigenvalues
 
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 045b70003118..c58fbaa42ad6 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3559,6 +3559,7 @@ typedef struct gqr_params_struct
     fortran_int M;
     fortran_int MC;
     fortran_int MN;
+    void* A;
     void *Q;
     fortran_int LDA;
     void* TAU;
@@ -3620,19 +3621,18 @@ call_@lapack_func@(GQR_PARAMS_t *params)
  */
 static inline int
 init_@lapack_func@(GQR_PARAMS_t *params,
-                   void* a,
                    fortran_int m,
                    fortran_int n)
 {
-    printf("%zu %zu\n", m, n);
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *q, *tau, *work;
+    npy_uint8 *a, *q, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
 
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
     size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
@@ -3640,19 +3640,20 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(q_size + tau_size);
+    mem_buff = malloc(q_size + tau_size + a_size);
 
     if (!mem_buff)
         goto error;
 
     q = mem_buff;
-    memcpy(q, a, q_size);
     tau = q + q_size;
+    a = tau + tau_size;
 
 
     params->M = m;
     params->MC = min_m_n;
     params->MN = min_m_n;
+    params->A = a;
     params->Q = q;
     params->TAU = tau;
     params->LDA = lda;
@@ -3702,18 +3703,18 @@ init_@lapack_func@(GQR_PARAMS_t *params,
  */
 static inline int
 init_@lapack_func@(GQR_PARAMS_t *params,
-                   void* a,
                    fortran_int m,
                    fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *q, *tau, *work;
+    npy_uint8 *a, *q, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
 
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
     size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
@@ -3721,19 +3722,20 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(q_size + tau_size);
+    mem_buff = malloc(q_size + tau_size + a_size);
 
     if (!mem_buff)
         goto error;
 
     q = mem_buff;
-    memcpy(q, a, q_size);
     tau = q + q_size;
+    a = tau + tau_size;
 
 
     params->M = m;
     params->MC = min_m_n;
     params->MN = min_m_n;
+    params->A = a;
     params->Q = q;
     params->TAU = tau;
     params->LDA = lda;
@@ -3814,14 +3816,243 @@ static void
     n = (fortran_int)dimensions[1];
 
 
-    if (init_@lapack_func@(&params, (void*)args[0], m, n)) {
-        LINEARIZE_DATA_t tau_in, q_out;
+    if (init_@lapack_func@(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, tau_in, q_out;
 
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
         init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
         init_linearize_data(&q_out, fortran_int_min(m, n), m, steps[4], steps[3]);
 
         BEGIN_OUTER_LOOP_3
             int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            size_t safe_min_m_n = fortran_int_min(m, n);
+            size_t safe_m = m;
+            memcpy(params.Q, params.A, safe_min_m_n * safe_m * sizeof(@ftyp@));
+            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_in);
+                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[1], &tau_in);
+                nan_@TYPE@_matrix(args[2], &q_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - complete) */
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sorgqr,dorgqr#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@_complete(GQR_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *q, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(q_size + tau_size + a_size);
+
+    if (!mem_buff)
+        goto error;
+
+    q = mem_buff;
+    tau = q + q_size;
+    a = tau + tau_size;
+
+
+    params->M = m;
+    params->MC = min_m_n;
+    params->MN = min_m_n;
+    params->A = a;
+    params->Q = q;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        /* IS THE FOLLOWING METHOD CORRECT? */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query;
+
+        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+    }
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    memset(work, 0, work_size);
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=CFLOAT,CDOUBLE#
+   #lapack_func=cungqr,zungqr#
+   #ftyp=fortran_complex,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@_complete(GQR_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *q, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(q_size + tau_size + a_size);
+
+    if (!mem_buff)
+        goto error;
+
+    q = mem_buff;
+    tau = q + q_size;
+    a = tau + tau_size;
+
+
+    params->M = m;
+    params->MC = m;
+    params->MN = min_m_n;
+    params->A = a;
+    params->Q = q;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        /* IS THE FOLLOWING METHOD CORRECT? */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query.r;
+
+        work_size  = (size_t) work_size_query.r * sizeof(@ftyp@);
+    }
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    memset(work, 0, work_size);
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float,  npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
+ */
+static void
+@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    GQR_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m;
+
+    INIT_OUTER_LOOP_3
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+
+
+    if (init_@lapack_func@_complete(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, tau_in, q_out;
+
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+        init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+        init_linearize_data(&q_out, m, m, steps[4], steps[3]);
+
+        BEGIN_OUTER_LOOP_3
+            int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            size_t safe_n = n;
+            size_t safe_m = m;
+            memcpy(params.Q, params.A, safe_n * safe_m * sizeof(@ftyp@));
             linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
@@ -3915,6 +4146,7 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_r_raw);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_reduced);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_complete);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
 
@@ -4002,6 +4234,13 @@ static char qr_reduced_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
+static char qr_complete_types[] = {
+    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
+    NPY_CFLOAT,  NPY_CFLOAT,  NPY_CFLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
 typedef struct gufunc_descriptor_struct {
     char *name;
     char *signature;
@@ -4246,6 +4485,24 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         4, 2, 1,
         FUNC_ARRAY_NAME(qr_reduced),
         qr_reduced_types
+    },
+    {
+        "qr_complete_m",
+        "(m,n),(m)->(m,m)",
+        "Compute Q matrix for the last two dimensions \n"\
+        "and broadcast to the rest. For m <= n. \n",
+        4, 2, 1,
+        FUNC_ARRAY_NAME(qr_complete),
+        qr_complete_types
+    },
+    {
+        "qr_complete_n",
+        "(m,n),(n)->(m,m)",
+        "Compute Q matrix for the last two dimensions \n"\
+        "and broadcast to the rest. For m > n. \n",
+        4, 2, 1,
+        FUNC_ARRAY_NAME(qr_complete),
+        qr_complete_types
     }
 };
 

From 885ba91db5132d3a4b1dfd5f60234bfedb08b432 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 08:29:41 +0530
Subject: [PATCH 04/26] moving intialization outside the loop

---
 numpy/linalg/umath_linalg.c.src | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index c58fbaa42ad6..a2f463d248b3 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3809,12 +3809,14 @@ static void
     GQR_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
+    size_t safe_min_m_n, safe_m;
 
     INIT_OUTER_LOOP_3
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-
+    safe_min_m_n = fortran_int_min(m, n);
+    safe_m = m;
 
     if (init_@lapack_func@(&params, m, n)) {
         LINEARIZE_DATA_t a_in, tau_in, q_out;
@@ -3826,8 +3828,6 @@ static void
         BEGIN_OUTER_LOOP_3
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            size_t safe_min_m_n = fortran_int_min(m, n);
-            size_t safe_m = m;
             memcpy(params.Q, params.A, safe_min_m_n * safe_m * sizeof(@ftyp@));
             linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);
@@ -4033,11 +4033,14 @@ static void
     GQR_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
+    size_t safe_n, safe_m;
 
     INIT_OUTER_LOOP_3
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
+    safe_n = n;
+    safe_m = m;
 
 
     if (init_@lapack_func@_complete(&params, m, n)) {
@@ -4050,8 +4053,6 @@ static void
         BEGIN_OUTER_LOOP_3
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            size_t safe_n = n;
-            size_t safe_m = m;
             memcpy(params.Q, params.A, safe_n * safe_m * sizeof(@ftyp@));
             linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);

From c33cf14e144f8f3f8c226e81bbf693cdba7b3187 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 09:00:45 +0530
Subject: [PATCH 05/26] existing tests passed

---
 numpy/linalg/linalg.py          | 19 ++++++++--
 numpy/linalg/umath_linalg.c.src | 62 ++++++++++++++++++++++-----------
 2 files changed, 57 insertions(+), 24 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 2031014ac7fb..631f5b3690d8 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -925,10 +925,17 @@ def qr(a, mode='reduced'):
 
     # handle modes that don't return q
     if mode == 'r':
-        return wrap(triu(a))
+        r = triu(a[..., :mn, :])
+        if t != result_t:
+            r = r.astype(result_t, copy=False)
+        return wrap(r)
 
     if mode == 'raw':
-        return wrap(transpose(a)), tau
+        q = transpose(a)
+        if t != result_t:
+            q = q.astype(result_t, copy=False)
+            tau = tau.astype(result_t, copy=False)
+        return wrap(q), tau
 
     if mode == 'economic':
         if t != result_t :
@@ -951,7 +958,13 @@ def qr(a, mode='reduced'):
     signature = 'DD->D' if isComplexType(t) else 'dd->d'
     extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
     q = gufunc(a, tau, signature=signature, extobj=extobj)
-    return wrap(q), wrap(triu(a[:, :mc]))
+    r = triu(a[..., :mc, :])
+
+    if t != result_t:
+        q = q.astype(result_t, copy=False)
+        r = r.astype(result_t, copy=False)
+
+    return wrap(q), wrap(r)
 
 # Eigenvalues
 
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index a2f463d248b3..1f794a1e3e23 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3372,6 +3372,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     {
         /* compute optimal work size */
         /* IS THE FOLLOWING METHOD CORRECT? */
+
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3380,11 +3381,14 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query;
+        work_count = (fortran_int) *(@ftyp@*) params->WORK;
 
-        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
     }
 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+                                    work_count > 0 ? work_count : -work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3393,7 +3397,6 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     memset(work, 0, work_size);
 
     params->WORK = work;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3451,6 +3454,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     {
         /* compute optimal work size */
         /* IS THE FOLLOWING METHOD CORRECT? */
+
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3459,11 +3463,15 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query.r;
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
-        work_size  = (size_t) work_size_query.r * sizeof(@ftyp@);
     }
 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+                                    work_count > 0 ? work_count : -work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3472,7 +3480,6 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     memset(work, 0, work_size);
 
     params->WORK = work;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3631,7 +3638,6 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
     size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
@@ -3669,11 +3675,15 @@ init_@lapack_func@(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query;
+        work_count = (fortran_int) *(@ftyp@*) params->WORK;
 
-        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
     }
 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+                                    work_count > 0 ? work_count : -work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3682,7 +3692,6 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     memset(work, 0, work_size);
 
     params->WORK = work;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3751,11 +3760,15 @@ init_@lapack_func@(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query.r;
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
-        work_size  = (size_t) work_size_query.r * sizeof(@ftyp@);
     }
 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+                                    work_count > 0 ? work_count : -work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3862,6 +3875,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
                    fortran_int m,
                    fortran_int n)
 {
+    printf("Inside zungqr\n");
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *q, *tau, *work;
@@ -3871,7 +3885,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
     size_t safe_n = n;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
@@ -3889,7 +3903,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
 
     params->M = m;
-    params->MC = min_m_n;
+    params->MC = m;
     params->MN = min_m_n;
     params->A = a;
     params->Q = q;
@@ -3907,11 +3921,15 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query;
+        work_count = (fortran_int) *(@ftyp@*) params->WORK;
 
-        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
     }
 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+                                    work_count > 0 ? work_count : -work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3920,7 +3938,6 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
     memset(work, 0, work_size);
 
     params->WORK = work;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3941,8 +3958,8 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
  */
 static inline int
 init_@lapack_func@_complete(GQR_PARAMS_t *params,
-                   fortran_int m,
-                   fortran_int n)
+                            fortran_int m,
+                            fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
@@ -3989,11 +4006,14 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query.r;
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
-        work_size  = (size_t) work_size_query.r * sizeof(@ftyp@);
     }
 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+                                    work_count > 0 ? work_count : -work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;

From b6cd5b2efb6d099d1d9be01697143c59bb1491eb Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 11:46:03 +0530
Subject: [PATCH 06/26] tests for stacked inputs added

---
 numpy/linalg/tests/test_linalg.py | 61 +++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index c6e8cdd039f1..6e03f8b9b5e6 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1,6 +1,7 @@
 """ Test functions for linalg module
 
 """
+from numpy.core.fromnumeric import shape
 import os
 import sys
 import itertools
@@ -11,6 +12,7 @@
 
 import numpy as np
 from numpy import array, single, double, csingle, cdouble, dot, identity, matmul
+from numpy.core import swapaxes
 from numpy import multiply, atleast_2d, inf, asarray
 from numpy import linalg
 from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError
@@ -1709,7 +1711,66 @@ def test_mode_all_but_economic(self):
             self.check_qr(m1)
             self.check_qr(m2)
             self.check_qr(m2.T)
+    
+    def check_qr_stacked(self, a):
+        # This test expects the argument `a` to be an ndarray or
+        # a subclass of an ndarray of inexact type.
+        a_type = type(a)
+        a_dtype = a.dtype
+        m, n = a.shape[-2:]
+        k = min(m, n)
+
+        # mode == 'complete'
+        q, r = linalg.qr(a, mode='complete')
+        assert_(q.dtype == a_dtype)
+        assert_(r.dtype == a_dtype)
+        assert_(isinstance(q, a_type))
+        assert_(isinstance(r, a_type))
+        assert_(q.shape[-2:] == (m, m))
+        assert_(r.shape[-2:] == (m, n))
+        assert_almost_equal(matmul(q, r), a)
+        assert_almost_equal(swapaxes(q, -1, -2).conj(), np.linalg.inv(q))
+        assert_almost_equal(np.triu(r[..., :, :]), r)
+
+        # mode == 'reduced'
+        q1, r1 = linalg.qr(a, mode='reduced')
+        assert_(q1.dtype == a_dtype)
+        assert_(r1.dtype == a_dtype)
+        assert_(isinstance(q1, a_type))
+        assert_(isinstance(r1, a_type))
+        assert_(q1.shape[-2:] == (m, k))
+        assert_(r1.shape[-2:] == (k, n))
+        assert_almost_equal(matmul(q1, r1), a)
+        assert_almost_equal(swapaxes(q, -1, -2).conj(), np.linalg.inv(q))
+        assert_almost_equal(np.triu(r1[..., :, :]), r1)
+
+        # mode == 'r'
+        r2 = linalg.qr(a, mode='r')
+        assert_(r2.dtype == a_dtype)
+        assert_(isinstance(r2, a_type))
+        assert_almost_equal(r2, r1)
 
+    def test_stacked_inputs(self):
+
+        curr_state = np.random.get_state()
+        np.random.seed(0)
+
+        normal = np.random.normal
+        sizes = [(3, 4), (4, 3), (4, 4), (3, 0), (0, 3)]
+        dts = [np.float32, np.float64, np.complex64]
+        for size in sizes:
+            for dt in dts:
+                a1, a2, a3, a4 = [normal(size=size), normal(size=size),
+                                  normal(size=size), normal(size=size)]
+                b1, b2, b3, b4 = [normal(size=size), normal(size=size),
+                                  normal(size=size), normal(size=size)]
+                A = np.asarray([[a1, a2], [a3, a4]], dtype=dt)
+                B = np.asarray([[b1, b2], [b3, b4]], dtype=dt)
+                self.check_qr_stacked(A)
+                self.check_qr_stacked(B)
+                self.check_qr_stacked(A + 1.j*B) 
+
+        np.random.set_state(curr_state)  
 
 class TestCholesky:
     # TODO: are there no other tests for cholesky?

From 4da4e5ef8ec198d70d1ed089b7ff829338a3c2a4 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 11:57:48 +0530
Subject: [PATCH 07/26] documentation updated

---
 numpy/linalg/linalg.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 631f5b3690d8..a3c09038d7ef 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -783,15 +783,15 @@ def qr(a, mode='reduced'):
 
     Parameters
     ----------
-    a : array_like, shape (M, N)
-        Matrix to be factored.
+    a : array_like, shape (..., M, N)
+        A real or complex array with ``a.ndim >= 2``.
     mode : {'reduced', 'complete', 'r', 'raw'}, optional
         If K = min(M, N), then
 
-        * 'reduced'  : returns q, r with dimensions (M, K), (K, N) (default)
-        * 'complete' : returns q, r with dimensions (M, M), (M, N)
-        * 'r'        : returns r only with dimensions (K, N)
-        * 'raw'      : returns h, tau with dimensions (N, M), (K,)
+        * 'reduced'  : returns q, r with dimensions (..., M, K), (..., K, N) (default)
+        * 'complete' : returns q, r with dimensions (..., M, M), (..., M, N)
+        * 'r'        : returns r only with dimensions (..., K, N)
+        * 'raw'      : returns h, tau with dimensions (..., N, M), (..., K,)
 
         The options 'reduced', 'complete, and 'raw' are new in numpy 1.8,
         see the notes for more information. The default is 'reduced', and to
@@ -810,9 +810,13 @@ def qr(a, mode='reduced'):
         A matrix with orthonormal columns. When mode = 'complete' the
         result is an orthogonal/unitary matrix depending on whether or not
         a is real/complex. The determinant may be either +/- 1 in that
-        case.
+        case. In case the number of dimensions in the input array is 
+        greater than 2 then a stack of the matrices with above properties 
+        is returned.
     r : ndarray of float or complex, optional
-        The upper-triangular matrix.
+        The upper-triangular matrix or a stack of upper-triangular
+        matrices if the number of dimensions in the input array is greater
+        than 2.
     (h, tau) : ndarrays of np.double or np.cdouble, optional
         The array h contains the Householder reflectors that generate q
         along with r. The tau array contains scaling factors for the
@@ -860,6 +864,14 @@ def qr(a, mode='reduced'):
     >>> r2 = np.linalg.qr(a, mode='r')
     >>> np.allclose(r, r2)  # mode='r' returns the same r as mode='full'
     True
+    >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input
+    >>> q, r = np.linalg.qr(a)
+    >>> q.shape
+    >>> (3, 2, 2)
+    >>> r.shape
+    >>> (3, 2, 2)
+    >>> np.allclose(a, np.matmul(q, r))
+    True
 
     Example illustrating a common use of `qr`: solving of least squares
     problems

From 89ff2af6e13f88b9e6f5beb9c11ab8d579b63f1c Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 12:13:41 +0530
Subject: [PATCH 08/26] linting issue fixed

---
 numpy/linalg/linalg.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index a3c09038d7ef..4bd142c83c98 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -100,7 +100,8 @@ def _raise_linalgerror_lstsq(err, flag):
     raise LinAlgError("SVD did not converge in Linear Least Squares")
 
 def _raise_linalgerror_qr_r_raw(err, flag):
-    raise LinAlgError("Illegal argument found while performing QR factorization")
+    raise LinAlgError("Illegal argument found while performing "
+                      "QR factorization")
 
 def get_linalg_error_extobj(callback):
     extobj = list(_linalg_error_extobj)  # make a copy

From 4628497abbcc305b72706109d885d1bfd82c6ea8 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 12:17:32 +0530
Subject: [PATCH 09/26] removed trailing white spaces

---
 numpy/linalg/linalg.py            |  8 ++++----
 numpy/linalg/tests/test_linalg.py |  6 +++---
 numpy/linalg/umath_linalg.c.src   | 28 ++++++++++++++--------------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 4bd142c83c98..2eca1113d15a 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -811,8 +811,8 @@ def qr(a, mode='reduced'):
         A matrix with orthonormal columns. When mode = 'complete' the
         result is an orthogonal/unitary matrix depending on whether or not
         a is real/complex. The determinant may be either +/- 1 in that
-        case. In case the number of dimensions in the input array is 
-        greater than 2 then a stack of the matrices with above properties 
+        case. In case the number of dimensions in the input array is
+        greater than 2 then a stack of the matrices with above properties
         is returned.
     r : ndarray of float or complex, optional
         The upper-triangular matrix or a stack of upper-triangular
@@ -967,7 +967,7 @@ def qr(a, mode='reduced'):
             gufunc = _umath_linalg.qr_reduced_m
         else:
             gufunc = _umath_linalg.qr_reduced_n
-    
+
     signature = 'DD->D' if isComplexType(t) else 'dd->d'
     extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
     q = gufunc(a, tau, signature=signature, extobj=extobj)
@@ -2174,7 +2174,7 @@ def lstsq(a, b, rcond="warn"):
     equal to, or greater than its number of linearly independent columns).
     If `a` is square and of full rank, then `x` (but for round-off error)
     is the "exact" solution of the equation. Else, `x` minimizes the
-    Euclidean 2-norm :math:`||b - ax||`. If there are multiple minimizing 
+    Euclidean 2-norm :math:`||b - ax||`. If there are multiple minimizing
     solutions, the one with the smallest 2-norm :math:`||x||` is returned.
 
     Parameters
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 6e03f8b9b5e6..5557ddf123f6 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1711,7 +1711,7 @@ def test_mode_all_but_economic(self):
             self.check_qr(m1)
             self.check_qr(m2)
             self.check_qr(m2.T)
-    
+
     def check_qr_stacked(self, a):
         # This test expects the argument `a` to be an ndarray or
         # a subclass of an ndarray of inexact type.
@@ -1768,9 +1768,9 @@ def test_stacked_inputs(self):
                 B = np.asarray([[b1, b2], [b3, b4]], dtype=dt)
                 self.check_qr_stacked(A)
                 self.check_qr_stacked(B)
-                self.check_qr_stacked(A + 1.j*B) 
+                self.check_qr_stacked(A + 1.j*B)
 
-        np.random.set_state(curr_state)  
+        np.random.set_state(curr_state)
 
 class TestCholesky:
     # TODO: are there no other tests for cholesky?
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 1f794a1e3e23..f3cd79945a86 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -162,36 +162,36 @@ FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               double rwork[], fortran_int iwork[],
               fortran_int *info);
 
-extern fortran_int 
+extern fortran_int
 FNAME(sgeqrf)(fortran_int *m, fortran_int *n, float a[], fortran_int *lda,
               float tau[], float work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int 
+extern fortran_int
 FNAME(cgeqrf)(fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int *lda,
               f2c_complex tau[], f2c_complex work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int 
+extern fortran_int
 FNAME(dgeqrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda,
               double tau[], double work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int 
+extern fortran_int
 FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda,
               f2c_doublecomplex tau[], f2c_doublecomplex work[],
               fortran_int *lwork, fortran_int *info);
 
-extern fortran_int 
+extern fortran_int
 FNAME(sorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, float a[], fortran_int *lda,
               float tau[], float work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int 
+extern fortran_int
 FNAME(dorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, double a[], fortran_int *lda,
               double tau[], double work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int 
+extern fortran_int
 FNAME(cungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_complex a[],
               fortran_int *lda, f2c_complex tau[],
               f2c_complex work[], fortran_int *lwork, fortran_int *info);
-extern fortran_int 
+extern fortran_int
 FNAME(zungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex a[],
               fortran_int *lda, f2c_doublecomplex tau[],
               f2c_doublecomplex work[], fortran_int *lwork, fortran_int *info);
@@ -3385,7 +3385,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
@@ -3467,7 +3467,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
@@ -3679,7 +3679,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
@@ -3764,7 +3764,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
@@ -3925,7 +3925,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
@@ -4010,7 +4010,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n), 
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);

From 9418a179402601291add7ebde3bfbedd54b22ef4 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 13:17:59 +0530
Subject: [PATCH 10/26] resolved linting issues

---
 numpy/linalg/linalg.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 2eca1113d15a..af5dcd1a0440 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -789,7 +789,8 @@ def qr(a, mode='reduced'):
     mode : {'reduced', 'complete', 'r', 'raw'}, optional
         If K = min(M, N), then
 
-        * 'reduced'  : returns q, r with dimensions (..., M, K), (..., K, N) (default)
+        * 'reduced'  : returns q, r with dimensions
+                       (..., M, K), (..., K, N) (default)
         * 'complete' : returns q, r with dimensions (..., M, M), (..., M, N)
         * 'r'        : returns r only with dimensions (..., K, N)
         * 'raw'      : returns h, tau with dimensions (..., N, M), (..., K,)

From 0d490d65faf24557260a4b74a75106df04207eff Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 13:20:31 +0530
Subject: [PATCH 11/26] resolved linting issues

---
 numpy/linalg/linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index af5dcd1a0440..372d6d8520a7 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -956,7 +956,7 @@ def qr(a, mode='reduced'):
             a = a.astype(result_t, copy=False)
         return wrap(a)
 
-    if mode == 'complete' and  m > n:
+    if mode == 'complete' and m > n:
         mc = m
         if m <= n:
             gufunc = _umath_linalg.qr_complete_m

From 3b64375d496e100ad9e510e88bf8dbc047add420 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 5 Jun 2021 13:54:34 +0530
Subject: [PATCH 12/26] documentation fix

---
 numpy/linalg/linalg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 372d6d8520a7..99d77120acd5 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -869,9 +869,9 @@ def qr(a, mode='reduced'):
     >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input
     >>> q, r = np.linalg.qr(a)
     >>> q.shape
-    >>> (3, 2, 2)
+    (3, 2, 2)
     >>> r.shape
-    >>> (3, 2, 2)
+    (3, 2, 2)
     >>> np.allclose(a, np.matmul(q, r))
     True
 

From cd66940acce58528f0597c59e8a2d2fe6d18925f Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Mon, 7 Jun 2021 11:24:39 +0530
Subject: [PATCH 13/26] removed debug prints and addressed review

---
 numpy/linalg/tests/test_linalg.py | 4 ----
 numpy/linalg/umath_linalg.c.src   | 1 -
 2 files changed, 5 deletions(-)

diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 5557ddf123f6..51f10f27c76e 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1752,9 +1752,6 @@ def check_qr_stacked(self, a):
 
     def test_stacked_inputs(self):
 
-        curr_state = np.random.get_state()
-        np.random.seed(0)
-
         normal = np.random.normal
         sizes = [(3, 4), (4, 3), (4, 4), (3, 0), (0, 3)]
         dts = [np.float32, np.float64, np.complex64]
@@ -1770,7 +1767,6 @@ def test_stacked_inputs(self):
                 self.check_qr_stacked(B)
                 self.check_qr_stacked(A + 1.j*B)
 
-        np.random.set_state(curr_state)
 
 class TestCholesky:
     # TODO: are there no other tests for cholesky?
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index f3cd79945a86..1a1a3babc2b6 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3875,7 +3875,6 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
                    fortran_int m,
                    fortran_int n)
 {
-    printf("Inside zungqr\n");
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *q, *tau, *work;

From 3e7baee083495ce0f648775f91771ded3c53775a Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Mon, 7 Jun 2021 13:18:57 +0530
Subject: [PATCH 14/26] Added release notes entry

---
 doc/release/upcoming_changes/19151.improvement.rst | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 doc/release/upcoming_changes/19151.improvement.rst

diff --git a/doc/release/upcoming_changes/19151.improvement.rst b/doc/release/upcoming_changes/19151.improvement.rst
new file mode 100644
index 000000000000..751cd8f75455
--- /dev/null
+++ b/doc/release/upcoming_changes/19151.improvement.rst
@@ -0,0 +1,6 @@
+`np.linalg.qr` accepts stacked matrices as inputs
+-------------------------------------------------
+
+`np.linalg.qr` is able to produce results for stacked matrices as inputs.
+Moreover, the implementation of QR decomposition has been shifted to C
+from Python.

From c2dc5f76d13a6a0b858f98757124b5ac1b4cba65 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Tue, 8 Jun 2021 08:50:35 +0530
Subject: [PATCH 15/26] Addressed reviews

---
 doc/release/upcoming_changes/19151.improvement.rst |  4 ++--
 numpy/linalg/linalg.py                             | 10 +++++++---
 numpy/linalg/umath_linalg.c.src                    |  3 +++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/doc/release/upcoming_changes/19151.improvement.rst b/doc/release/upcoming_changes/19151.improvement.rst
index 751cd8f75455..e138b39bd1e0 100644
--- a/doc/release/upcoming_changes/19151.improvement.rst
+++ b/doc/release/upcoming_changes/19151.improvement.rst
@@ -1,6 +1,6 @@
-`np.linalg.qr` accepts stacked matrices as inputs
+`numpy.linalg.qr` accepts stacked matrices as inputs
 -------------------------------------------------
 
-`np.linalg.qr` is able to produce results for stacked matrices as inputs.
+`numpy.linalg.qr` is able to produce results for stacked matrices as inputs.
 Moreover, the implementation of QR decomposition has been shifted to C
 from Python.
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 99d77120acd5..4b3ae79f820b 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -99,7 +99,7 @@ def _raise_linalgerror_svd_nonconvergence(err, flag):
 def _raise_linalgerror_lstsq(err, flag):
     raise LinAlgError("SVD did not converge in Linear Least Squares")
 
-def _raise_linalgerror_qr_r_raw(err, flag):
+def _raise_linalgerror_qr(err, flag):
     raise LinAlgError("Illegal argument found while performing "
                       "QR factorization")
 
@@ -934,7 +934,7 @@ def qr(a, mode='reduced'):
         gufunc = _umath_linalg.qr_r_raw_n
 
     signature = 'D->D' if isComplexType(t) else 'd->d'
-    extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
+    extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
     tau = gufunc(a, signature=signature, extobj=extobj)
 
     # handle modes that don't return q
@@ -956,6 +956,10 @@ def qr(a, mode='reduced'):
             a = a.astype(result_t, copy=False)
         return wrap(a)
 
+    # mc is the number of columns in the resulting q
+    # matrix. If the mode is complete then it is 
+    # same as number of rows, and if the mode is reduced,
+    # then it is the minimum of number of rows and columns.
     if mode == 'complete' and m > n:
         mc = m
         if m <= n:
@@ -970,7 +974,7 @@ def qr(a, mode='reduced'):
             gufunc = _umath_linalg.qr_reduced_n
 
     signature = 'DD->D' if isComplexType(t) else 'dd->d'
-    extobj = get_linalg_error_extobj(_raise_linalgerror_qr_r_raw)
+    extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
     q = gufunc(a, tau, signature=signature, extobj=extobj)
     r = triu(a[..., :mc, :])
 
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 1a1a3babc2b6..5b56c95656a7 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -4240,6 +4240,7 @@ static char lstsq_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
 };
 
+/* A, tau */
 static char qr_r_raw_types[] = {
     NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,
@@ -4247,6 +4248,7 @@ static char qr_r_raw_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
+/* A, tau, q */
 static char qr_reduced_types[] = {
     NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
@@ -4254,6 +4256,7 @@ static char qr_reduced_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
+/* A, tau, q */
 static char qr_complete_types[] = {
     NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,

From 752987488b57582ac7a8e455f701d34e1756bdc6 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Tue, 8 Jun 2021 09:31:00 +0530
Subject: [PATCH 16/26] fixed title line in release notes

---
 doc/release/upcoming_changes/19151.improvement.rst | 2 +-
 numpy/linalg/umath_linalg.c.src                    | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/doc/release/upcoming_changes/19151.improvement.rst b/doc/release/upcoming_changes/19151.improvement.rst
index e138b39bd1e0..2108b9c4f96d 100644
--- a/doc/release/upcoming_changes/19151.improvement.rst
+++ b/doc/release/upcoming_changes/19151.improvement.rst
@@ -1,5 +1,5 @@
 `numpy.linalg.qr` accepts stacked matrices as inputs
--------------------------------------------------
+----------------------------------------------------
 
 `numpy.linalg.qr` is able to produce results for stacked matrices as inputs.
 Moreover, the implementation of QR decomposition has been shifted to C
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 5b56c95656a7..47c2f04230ca 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3371,7 +3371,6 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     {
         /* compute optimal work size */
-        /* IS THE FOLLOWING METHOD CORRECT? */
 
         @ftyp@ work_size_query;
 
@@ -3453,7 +3452,6 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     {
         /* compute optimal work size */
-        /* IS THE FOLLOWING METHOD CORRECT? */
 
         @ftyp@ work_size_query;
 
@@ -3666,7 +3664,6 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
     {
         /* compute optimal work size */
-        /* IS THE FOLLOWING METHOD CORRECT? */
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3751,7 +3748,6 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
     {
         /* compute optimal work size */
-        /* IS THE FOLLOWING METHOD CORRECT? */
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3911,7 +3907,6 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
     {
         /* compute optimal work size */
-        /* IS THE FOLLOWING METHOD CORRECT? */
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3996,7 +3991,6 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
     {
         /* compute optimal work size */
-        /* IS THE FOLLOWING METHOD CORRECT? */
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;

From 38fcfc226a621769bbf8154e9e789c1e7a5a77c8 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Wed, 9 Jun 2021 08:36:30 +0530
Subject: [PATCH 17/26] Take work_count as it is

---
 numpy/linalg/umath_linalg.c.src | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 47c2f04230ca..8cb385518b5d 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3385,7 +3385,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+                                    work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
     mem_buff2 = malloc(work_size);
@@ -3466,7 +3466,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+                                    work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -3677,7 +3677,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+                                    work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -3761,7 +3761,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+                                    work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -3920,7 +3920,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+                                    work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -4004,7 +4004,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+                                    work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
     mem_buff2 = malloc(work_size);

From 03bfafc427f23f3335f41d541730df6e9cfc514c Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Wed, 9 Jun 2021 08:39:38 +0530
Subject: [PATCH 18/26] shifted qr code in umath_linalg.c.src above

---
 numpy/linalg/umath_linalg.c.src | 853 ++++++++++++++++----------------
 1 file changed, 426 insertions(+), 427 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 8cb385518b5d..3d74db210f6c 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -2868,170 +2868,133 @@ static void
 
 /**end repeat**/
 
-
 /* -------------------------------------------------------------------------- */
-                 /* least squares */
+                 /* qr (modes - r, raw) */
 
-typedef struct gelsd_params_struct
+typedef struct geqfr_params_struct
 {
     fortran_int M;
     fortran_int N;
-    fortran_int NRHS;
     void *A;
     fortran_int LDA;
-    void *B;
-    fortran_int LDB;
-    void *S;
-    void *RCOND;
-    fortran_int RANK;
+    void* TAU;
     void *WORK;
     fortran_int LWORK;
-    void *RWORK;
-    void *IWORK;
-} GELSD_PARAMS_t;
+} GEQRF_PARAMS_t;
 
 
 static inline void
-dump_gelsd_params(const char *name,
-                  GELSD_PARAMS_t *params)
+dump_geqrf_params(const char *name,
+                  GEQRF_PARAMS_t *params)
 {
     TRACE_TXT("\n%s:\n"\
 
               "%14s: %18p\n"\
               "%14s: %18p\n"\
               "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
               "%14s: %18d\n"\
               "%14s: %18d\n"\
               "%14s: %18d\n"\
-
-              "%14s: %18p\n",
+              "%14s: %18d\n",
 
               name,
 
               "A", params->A,
-              "B", params->B,
-              "S", params->S,
+              "TAU", params->TAU,
               "WORK", params->WORK,
-              "RWORK", params->RWORK,
-              "IWORK", params->IWORK,
 
               "M", (int)params->M,
               "N", (int)params->N,
-              "NRHS", (int)params->NRHS,
               "LDA", (int)params->LDA,
-              "LDB", (int)params->LDB,
-              "LWORK", (int)params->LWORK,
-              "RANK", (int)params->RANK,
-
-              "RCOND", params->RCOND);
+              "LWORK", (int)params->LWORK);
 }
 
-
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sgelsd,dgelsd#
-   #ftyp=fortran_real,fortran_doublereal#
+   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
  */
 
 static inline fortran_int
-call_@lapack_func@(GELSD_PARAMS_t *params)
+call_@lapack_func@(GEQRF_PARAMS_t *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+    LAPACK(@lapack_func@)(&params->M, &params->N,
                           params->A, &params->LDA,
-                          params->B, &params->LDB,
-                          params->S,
-                          params->RCOND, &params->RANK,
+                          params->TAU,
                           params->WORK, &params->LWORK,
-                          params->IWORK,
                           &rv);
     return rv;
 }
 
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sgeqrf,dgeqrf#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
 static inline int
-init_@lapack_func@(GELSD_PARAMS_t *params,
+init_@lapack_func@(GEQRF_PARAMS_t *params,
                    fortran_int m,
-                   fortran_int n,
-                   fortran_int nrhs)
+                   fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *b, *s, *work, *iwork;
+    npy_uint8 *a, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
-    fortran_int max_m_n = fortran_int_max(m, n);
     size_t safe_min_m_n = min_m_n;
-    size_t safe_max_m_n = max_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-    size_t safe_nrhs = nrhs;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
-    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
     size_t work_size;
-    size_t iwork_size;
     fortran_int lda = fortran_int_max(1, m);
-    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(a_size + b_size + s_size);
+    mem_buff = malloc(a_size + tau_size);
 
     if (!mem_buff)
         goto error;
 
     a = mem_buff;
-    b = a + a_size;
-    s = b + b_size;
+    tau = a + a_size;
+    memset(tau, 0, tau_size);
 
 
     params->M = m;
     params->N = n;
-    params->NRHS = nrhs;
     params->A = a;
-    params->B = b;
-    params->S = s;
+    params->TAU = tau;
     params->LDA = lda;
-    params->LDB = ldb;
 
     {
         /* compute optimal work size */
+
         @ftyp@ work_size_query;
-        fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
-        params->IWORK = &iwork_size_query;
-        params->RWORK = NULL;
         params->LWORK = -1;
 
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query;
+        work_count = (fortran_int) *(@ftyp@*) params->WORK;
 
-        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
-        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    mem_buff2 = malloc(work_size + iwork_size);
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
+                                    work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+    mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
-    iwork = work + work_size;
+    memset(work, 0, work_size);
 
     params->WORK = work;
-    params->RWORK = NULL;
-    params->IWORK = iwork;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3047,105 +3010,73 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
 
 /**begin repeat
    #TYPE=CFLOAT,CDOUBLE#
+   #lapack_func=cgeqrf,zgeqrf#
    #ftyp=fortran_complex,fortran_doublecomplex#
-   #frealtyp=fortran_real,fortran_doublereal#
-   #typ=COMPLEX_t,DOUBLECOMPLEX_t#
-   #lapack_func=cgelsd,zgelsd#
  */
-
-static inline fortran_int
-call_@lapack_func@(GELSD_PARAMS_t *params)
-{
-    fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
-                          params->A, &params->LDA,
-                          params->B, &params->LDB,
-                          params->S,
-                          params->RCOND, &params->RANK,
-                          params->WORK, &params->LWORK,
-                          params->RWORK, params->IWORK,
-                          &rv);
-    return rv;
-}
-
 static inline int
-init_@lapack_func@(GELSD_PARAMS_t *params,
+init_@lapack_func@(GEQRF_PARAMS_t *params,
                    fortran_int m,
-                   fortran_int n,
-                   fortran_int nrhs)
+                   fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
+    npy_uint8 *a, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
-    fortran_int max_m_n = fortran_int_max(m, n);
     size_t safe_min_m_n = min_m_n;
-    size_t safe_max_m_n = max_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-    size_t safe_nrhs = nrhs;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
-    size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
-    size_t work_size, rwork_size, iwork_size;
+    size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
-    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(a_size + b_size + s_size);
+    mem_buff = malloc(a_size + tau_size);
 
     if (!mem_buff)
         goto error;
 
     a = mem_buff;
-    b = a + a_size;
-    s = b + b_size;
+    tau = a + a_size;
+    memset(tau, 0, tau_size);
 
 
     params->M = m;
     params->N = n;
-    params->NRHS = nrhs;
     params->A = a;
-    params->B = b;
-    params->S = s;
+    params->TAU = tau;
     params->LDA = lda;
-    params->LDB = ldb;
 
     {
         /* compute optimal work size */
+
         @ftyp@ work_size_query;
-        @frealtyp@ rwork_size_query;
-        fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
-        params->IWORK = &iwork_size_query;
-        params->RWORK = &rwork_size_query;
         params->LWORK = -1;
 
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query.r;
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
-        work_size  = (size_t )work_size_query.r * sizeof(@ftyp@);
-        rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
-        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    mem_buff2 = malloc(work_size + rwork_size + iwork_size);
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
+                                    work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+    mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
-    rwork = work + work_size;
-    iwork = rwork + rwork_size;
+    memset(work, 0, work_size);
 
     params->WORK = work;
-    params->RWORK = rwork;
-    params->IWORK = iwork;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3159,20 +3090,11 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
 
 /**end repeat**/
 
-
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
-   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
-   #dot_func=sdot,ddot,cdotc,zdotc#
-   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float,  npy_double#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #cmplx = 0, 0, 1, 1#
+   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
  */
 static inline void
-release_@lapack_func@(GELSD_PARAMS_t* params)
+release_@lapack_func@(GEQRF_PARAMS_t* params)
 {
     /* A and WORK contain allocated blocks */
     free(params->A);
@@ -3180,84 +3102,48 @@ release_@lapack_func@(GELSD_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
-/** Compute the squared l2 norm of a contiguous vector */
-static @basetyp@
-@TYPE@_abs2(@typ@ *p, npy_intp n) {
-    npy_intp i;
-    @basetyp@ res = 0;
-    for (i = 0; i < n; i++) {
-        @typ@ el = p[i];
-#if @cmplx@
-        res += el.real*el.real + el.imag*el.imag;
-#else
-        res += el*el;
-#endif
-    }
-    return res;
-}
+/**end repeat**/
 
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float,  npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
+ */
 static void
-@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
-             void *NPY_UNUSED(func))
+@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
 {
-    GELSD_PARAMS_t params;
+    GEQRF_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
-    fortran_int n, m, nrhs;
-    fortran_int excess;
+    fortran_int n, m;
 
-    INIT_OUTER_LOOP_7
+    INIT_OUTER_LOOP_2
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-    nrhs = (fortran_int)dimensions[2];
-    excess = m - n;
 
-    if (init_@lapack_func@(&params, m, n, nrhs)) {
-        LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
+    if (init_@lapack_func@(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, tau_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
-        init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
-        init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
-        init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
-        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
+        init_linearize_data(&tau_out, 1, fortran_int_min(m, n), 1, steps[2]);
 
-        BEGIN_OUTER_LOOP_7
+        BEGIN_OUTER_LOOP_2
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
-            params.RCOND = args[2];
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
-                *(npy_int*) args[5] = params.RANK;
-                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
-
-                /* Note that linalg.lstsq discards this when excess == 0 */
-                if (excess >= 0 && params.RANK == n) {
-                    /* Compute the residuals as the square sum of each column */
-                    int i;
-                    char *resid = args[4];
-                    @ftyp@ *components = (@ftyp@ *)params.B + n;
-                    for (i = 0; i < nrhs; i++) {
-                        @ftyp@ *vector = components + i*m;
-                        /* Numpy and fortran floating types are the same size,
-                         * so this cast is safe */
-                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
-                        memcpy(
-                            resid + i*r_out.column_strides,
-                            &abs2, sizeof(abs2));
-                    }
-                }
-                else {
-                    /* Note that this is always discarded by linalg.lstsq */
-                    nan_@REALTYPE@_matrix(args[4], &r_out);
-                }
+                delinearize_@TYPE@_matrix(args[0], params.A, &a_in);
+                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[3], &x_out);
-                nan_@REALTYPE@_matrix(args[4], &r_out);
-                *(npy_int*) args[5] = -1;
-                nan_@REALTYPE@_matrix(args[6], &s_out);
+                nan_@TYPE@_matrix(args[0], &a_in);
+                nan_@TYPE@_matrix(args[1], &tau_out);
             }
         END_OUTER_LOOP
 
@@ -3270,23 +3156,25 @@ static void
 /**end repeat**/
 
 /* -------------------------------------------------------------------------- */
-                 /* qr (modes - r, raw) */
+                 /* qr (modes - reduced) */
 
-typedef struct geqfr_params_struct
+typedef struct gqr_params_struct
 {
     fortran_int M;
-    fortran_int N;
-    void *A;
+    fortran_int MC;
+    fortran_int MN;
+    void* A;
+    void *Q;
     fortran_int LDA;
     void* TAU;
     void *WORK;
     fortran_int LWORK;
-} GEQRF_PARAMS_t;
+} GQR_PARAMS_t;
 
 
 static inline void
-dump_geqrf_params(const char *name,
-                  GEQRF_PARAMS_t *params)
+dump_gqr_params(const char *name,
+                GQR_PARAMS_t *params)
 {
     TRACE_TXT("\n%s:\n"\
 
@@ -3296,30 +3184,32 @@ dump_geqrf_params(const char *name,
               "%14s: %18d\n"\
               "%14s: %18d\n"\
               "%14s: %18d\n"\
+              "%14s: %18d\n"\
               "%14s: %18d\n",
 
               name,
 
-              "A", params->A,
+              "Q", params->Q,
               "TAU", params->TAU,
               "WORK", params->WORK,
 
               "M", (int)params->M,
-              "N", (int)params->N,
+              "MC", (int)params->MC,
+              "MN", (int)params->MN,
               "LDA", (int)params->LDA,
               "LWORK", (int)params->LWORK);
 }
 
 /**begin repeat
-   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
  */
 
 static inline fortran_int
-call_@lapack_func@(GEQRF_PARAMS_t *params)
+call_@lapack_func@(GQR_PARAMS_t *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N,
-                          params->A, &params->LDA,
+    LAPACK(@lapack_func@)(&params->M, &params->MC, &params->MN,
+                          params->Q, &params->LDA,
                           params->TAU,
                           params->WORK, &params->LWORK,
                           &rv);
@@ -3330,48 +3220,49 @@ call_@lapack_func@(GEQRF_PARAMS_t *params)
 
 /**begin repeat
    #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sgeqrf,dgeqrf#
+   #lapack_func=sorgqr,dorgqr#
    #ftyp=fortran_real,fortran_doublereal#
  */
 static inline int
-init_@lapack_func@(GEQRF_PARAMS_t *params,
+init_@lapack_func@(GQR_PARAMS_t *params,
                    fortran_int m,
                    fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *tau, *work;
+    npy_uint8 *a, *q, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(a_size + tau_size);
+    mem_buff = malloc(q_size + tau_size + a_size);
 
     if (!mem_buff)
         goto error;
 
-    a = mem_buff;
-    tau = a + a_size;
-    memset(tau, 0, tau_size);
+    q = mem_buff;
+    tau = q + q_size;
+    a = tau + tau_size;
 
 
     params->M = m;
-    params->N = n;
+    params->MC = min_m_n;
+    params->MN = min_m_n;
     params->A = a;
+    params->Q = q;
     params->TAU = tau;
     params->LDA = lda;
 
     {
         /* compute optimal work size */
-
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3388,6 +3279,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
                                     work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3411,48 +3303,50 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
 /**begin repeat
    #TYPE=CFLOAT,CDOUBLE#
-   #lapack_func=cgeqrf,zgeqrf#
+   #lapack_func=cungqr,zungqr#
    #ftyp=fortran_complex,fortran_doublecomplex#
  */
 static inline int
-init_@lapack_func@(GEQRF_PARAMS_t *params,
+init_@lapack_func@(GQR_PARAMS_t *params,
                    fortran_int m,
                    fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *tau, *work;
+    npy_uint8 *a, *q, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(a_size + tau_size);
+    mem_buff = malloc(q_size + tau_size + a_size);
 
     if (!mem_buff)
         goto error;
 
-    a = mem_buff;
-    tau = a + a_size;
-    memset(tau, 0, tau_size);
+    q = mem_buff;
+    tau = q + q_size;
+    a = tau + tau_size;
 
 
     params->M = m;
-    params->N = n;
+    params->MC = min_m_n;
+    params->MN = min_m_n;
     params->A = a;
+    params->Q = q;
     params->TAU = tau;
     params->LDA = lda;
 
     {
         /* compute optimal work size */
-
         @ftyp@ work_size_query;
 
         params->WORK = &work_size_query;
@@ -3478,6 +3372,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     memset(work, 0, work_size);
 
     params->WORK = work;
+    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3492,13 +3387,13 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
  */
 static inline void
-release_@lapack_func@(GEQRF_PARAMS_t* params)
+release_@lapack_func@(GQR_PARAMS_t* params)
 {
     /* A and WORK contain allocated blocks */
-    free(params->A);
+    free(params->Q);
     free(params->WORK);
     memset(params, 0, sizeof(*params));
 }
@@ -3508,7 +3403,7 @@ release_@lapack_func@(GEQRF_PARAMS_t* params)
 /**begin repeat
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
    #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
    #basetyp = npy_float, npy_double, npy_float,  npy_double#
    #ftyp = fortran_real, fortran_doublereal,
@@ -3516,35 +3411,41 @@ release_@lapack_func@(GEQRF_PARAMS_t* params)
    #cmplx = 0, 0, 1, 1#
  */
 static void
-@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
-          void *NPY_UNUSED(func))
+@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
 {
-    GEQRF_PARAMS_t params;
+    GQR_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
+    size_t safe_min_m_n, safe_m;
 
-    INIT_OUTER_LOOP_2
+    INIT_OUTER_LOOP_3
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
+    safe_min_m_n = fortran_int_min(m, n);
+    safe_m = m;
 
     if (init_@lapack_func@(&params, m, n)) {
-        LINEARIZE_DATA_t a_in, tau_out;
+        LINEARIZE_DATA_t a_in, tau_in, q_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
-        init_linearize_data(&tau_out, 1, fortran_int_min(m, n), 1, steps[2]);
+        init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+        init_linearize_data(&q_out, fortran_int_min(m, n), m, steps[4], steps[3]);
 
-        BEGIN_OUTER_LOOP_2
+        BEGIN_OUTER_LOOP_3
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            memcpy(params.Q, params.A, safe_min_m_n * safe_m * sizeof(@ftyp@));
+            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[0], params.A, &a_in);
-                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
+                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_in);
+                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[0], &a_in);
-                nan_@TYPE@_matrix(args[1], &tau_out);
+                nan_@TYPE@_matrix(args[1], &tau_in);
+                nan_@TYPE@_matrix(args[2], &q_out);
             }
         END_OUTER_LOOP
 
@@ -3557,67 +3458,7 @@ static void
 /**end repeat**/
 
 /* -------------------------------------------------------------------------- */
-                 /* qr (modes - reduced) */
-
-typedef struct gqr_params_struct
-{
-    fortran_int M;
-    fortran_int MC;
-    fortran_int MN;
-    void* A;
-    void *Q;
-    fortran_int LDA;
-    void* TAU;
-    void *WORK;
-    fortran_int LWORK;
-} GQR_PARAMS_t;
-
-
-static inline void
-dump_gqr_params(const char *name,
-                GQR_PARAMS_t *params)
-{
-    TRACE_TXT("\n%s:\n"\
-
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n",
-
-              name,
-
-              "Q", params->Q,
-              "TAU", params->TAU,
-              "WORK", params->WORK,
-
-              "M", (int)params->M,
-              "MC", (int)params->MC,
-              "MN", (int)params->MN,
-              "LDA", (int)params->LDA,
-              "LWORK", (int)params->LWORK);
-}
-
-/**begin repeat
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
- */
-
-static inline fortran_int
-call_@lapack_func@(GQR_PARAMS_t *params)
-{
-    fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->MC, &params->MN,
-                          params->Q, &params->LDA,
-                          params->TAU,
-                          params->WORK, &params->LWORK,
-                          &rv);
-    return rv;
-}
-
-/**end repeat**/
+                 /* qr (modes - complete) */
 
 /**begin repeat
    #TYPE=FLOAT,DOUBLE#
@@ -3625,7 +3466,7 @@ call_@lapack_func@(GQR_PARAMS_t *params)
    #ftyp=fortran_real,fortran_doublereal#
  */
 static inline int
-init_@lapack_func@(GQR_PARAMS_t *params,
+init_@lapack_func@_complete(GQR_PARAMS_t *params,
                    fortran_int m,
                    fortran_int n)
 {
@@ -3636,8 +3477,9 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
+
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
@@ -3655,7 +3497,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 
     params->M = m;
-    params->MC = min_m_n;
+    params->MC = m;
     params->MN = min_m_n;
     params->A = a;
     params->Q = q;
@@ -3708,9 +3550,9 @@ init_@lapack_func@(GQR_PARAMS_t *params,
    #ftyp=fortran_complex,fortran_doublecomplex#
  */
 static inline int
-init_@lapack_func@(GQR_PARAMS_t *params,
-                   fortran_int m,
-                   fortran_int n)
+init_@lapack_func@_complete(GQR_PARAMS_t *params,
+                            fortran_int m,
+                            fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
@@ -3721,7 +3563,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     size_t safe_n = n;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
@@ -3739,7 +3581,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 
     params->M = m;
-    params->MC = min_m_n;
+    params->MC = m;
     params->MN = min_m_n;
     params->A = a;
     params->Q = q;
@@ -3764,7 +3606,6 @@ init_@lapack_func@(GQR_PARAMS_t *params,
                                     work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
-
     mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
@@ -3787,20 +3628,6 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 /**end repeat**/
 
-/**begin repeat
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
- */
-static inline void
-release_@lapack_func@(GQR_PARAMS_t* params)
-{
-    /* A and WORK contain allocated blocks */
-    free(params->Q);
-    free(params->WORK);
-    memset(params, 0, sizeof(*params));
-}
-
-/**end repeat**/
-
 /**begin repeat
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
@@ -3812,32 +3639,33 @@ release_@lapack_func@(GQR_PARAMS_t* params)
    #cmplx = 0, 0, 1, 1#
  */
 static void
-@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
+@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
                   void *NPY_UNUSED(func))
 {
     GQR_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
-    size_t safe_min_m_n, safe_m;
+    size_t safe_n, safe_m;
 
     INIT_OUTER_LOOP_3
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-    safe_min_m_n = fortran_int_min(m, n);
+    safe_n = n;
     safe_m = m;
 
-    if (init_@lapack_func@(&params, m, n)) {
+
+    if (init_@lapack_func@_complete(&params, m, n)) {
         LINEARIZE_DATA_t a_in, tau_in, q_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
         init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
-        init_linearize_data(&q_out, fortran_int_min(m, n), m, steps[4], steps[3]);
+        init_linearize_data(&q_out, m, m, steps[4], steps[3]);
 
         BEGIN_OUTER_LOOP_3
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            memcpy(params.Q, params.A, safe_min_m_n * safe_m * sizeof(@ftyp@));
+            memcpy(params.Q, params.A, safe_n * safe_m * sizeof(@ftyp@));
             linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
@@ -3859,79 +3687,168 @@ static void
 /**end repeat**/
 
 /* -------------------------------------------------------------------------- */
-                 /* qr (modes - complete) */
+                 /* least squares */
+
+typedef struct gelsd_params_struct
+{
+    fortran_int M;
+    fortran_int N;
+    fortran_int NRHS;
+    void *A;
+    fortran_int LDA;
+    void *B;
+    fortran_int LDB;
+    void *S;
+    void *RCOND;
+    fortran_int RANK;
+    void *WORK;
+    fortran_int LWORK;
+    void *RWORK;
+    void *IWORK;
+} GELSD_PARAMS_t;
+
+
+static inline void
+dump_gelsd_params(const char *name,
+                  GELSD_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+
+              "%14s: %18p\n",
+
+              name,
+
+              "A", params->A,
+              "B", params->B,
+              "S", params->S,
+              "WORK", params->WORK,
+              "RWORK", params->RWORK,
+              "IWORK", params->IWORK,
+
+              "M", (int)params->M,
+              "N", (int)params->N,
+              "NRHS", (int)params->NRHS,
+              "LDA", (int)params->LDA,
+              "LDB", (int)params->LDB,
+              "LWORK", (int)params->LWORK,
+              "RANK", (int)params->RANK,
+
+              "RCOND", params->RCOND);
+}
+
 
 /**begin repeat
    #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sorgqr,dorgqr#
+   #lapack_func=sgelsd,dgelsd#
    #ftyp=fortran_real,fortran_doublereal#
  */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->IWORK,
+                          &rv);
+    return rv;
+}
+
 static inline int
-init_@lapack_func@_complete(GQR_PARAMS_t *params,
+init_@lapack_func@(GELSD_PARAMS_t *params,
                    fortran_int m,
-                   fortran_int n)
+                   fortran_int n,
+                   fortran_int nrhs)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *q, *tau, *work;
+    npy_uint8 *a, *b, *s, *work, *iwork;
     fortran_int min_m_n = fortran_int_min(m, n);
+    fortran_int max_m_n = fortran_int_max(m, n);
     size_t safe_min_m_n = min_m_n;
+    size_t safe_max_m_n = max_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
+    size_t safe_nrhs = nrhs;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
     size_t work_size;
+    size_t iwork_size;
     fortran_int lda = fortran_int_max(1, m);
+    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(q_size + tau_size + a_size);
+    mem_buff = malloc(a_size + b_size + s_size);
 
     if (!mem_buff)
         goto error;
 
-    q = mem_buff;
-    tau = q + q_size;
-    a = tau + tau_size;
-
+    a = mem_buff;
+    b = a + a_size;
+    s = b + b_size;
+
 
     params->M = m;
-    params->MC = m;
-    params->MN = min_m_n;
+    params->N = n;
+    params->NRHS = nrhs;
     params->A = a;
-    params->Q = q;
-    params->TAU = tau;
+    params->B = b;
+    params->S = s;
     params->LDA = lda;
+    params->LDB = ldb;
 
     {
         /* compute optimal work size */
         @ftyp@ work_size_query;
+        fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
+        params->IWORK = &iwork_size_query;
+        params->RWORK = NULL;
         params->LWORK = -1;
 
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int) *(@ftyp@*) params->WORK;
+        work_count = (fortran_int)work_size_query;
 
+        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count);
-
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
-
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = malloc(work_size + iwork_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
-    memset(work, 0, work_size);
+    iwork = work + work_size;
 
     params->WORK = work;
+    params->RWORK = NULL;
+    params->IWORK = iwork;
+    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3947,74 +3864,104 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
 /**begin repeat
    #TYPE=CFLOAT,CDOUBLE#
-   #lapack_func=cungqr,zungqr#
    #ftyp=fortran_complex,fortran_doublecomplex#
+   #frealtyp=fortran_real,fortran_doublereal#
+   #typ=COMPLEX_t,DOUBLECOMPLEX_t#
+   #lapack_func=cgelsd,zgelsd#
  */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->RWORK, params->IWORK,
+                          &rv);
+    return rv;
+}
+
 static inline int
-init_@lapack_func@_complete(GQR_PARAMS_t *params,
-                            fortran_int m,
-                            fortran_int n)
+init_@lapack_func@(GELSD_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n,
+                   fortran_int nrhs)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *q, *tau, *work;
+    npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
     fortran_int min_m_n = fortran_int_min(m, n);
+    fortran_int max_m_n = fortran_int_max(m, n);
     size_t safe_min_m_n = min_m_n;
+    size_t safe_max_m_n = max_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
+    size_t safe_nrhs = nrhs;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
 
     fortran_int work_count;
-    size_t work_size;
+    size_t work_size, rwork_size, iwork_size;
     fortran_int lda = fortran_int_max(1, m);
+    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(q_size + tau_size + a_size);
+    mem_buff = malloc(a_size + b_size + s_size);
 
     if (!mem_buff)
         goto error;
 
-    q = mem_buff;
-    tau = q + q_size;
-    a = tau + tau_size;
+    a = mem_buff;
+    b = a + a_size;
+    s = b + b_size;
 
 
     params->M = m;
-    params->MC = m;
-    params->MN = min_m_n;
+    params->N = n;
+    params->NRHS = nrhs;
     params->A = a;
-    params->Q = q;
-    params->TAU = tau;
+    params->B = b;
+    params->S = s;
     params->LDA = lda;
+    params->LDB = ldb;
 
     {
         /* compute optimal work size */
         @ftyp@ work_size_query;
+        @frealtyp@ rwork_size_query;
+        fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
+        params->IWORK = &iwork_size_query;
+        params->RWORK = &rwork_size_query;
         params->LWORK = -1;
 
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count = (fortran_int)work_size_query.r;
 
+        work_size  = (size_t )work_size_query.r * sizeof(@ftyp@);
+        rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
+        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count);
-
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = malloc(work_size + rwork_size + iwork_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
-    memset(work, 0, work_size);
+    rwork = work + work_size;
+    iwork = rwork + rwork_size;
 
     params->WORK = work;
+    params->RWORK = rwork;
+    params->IWORK = iwork;
     params->LWORK = work_count;
 
     return 1;
@@ -4029,53 +3976,105 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
 /**end repeat**/
 
+
 /**begin repeat
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
+   #dot_func=sdot,ddot,cdotc,zdotc#
    #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
    #basetyp = npy_float, npy_double, npy_float,  npy_double#
    #ftyp = fortran_real, fortran_doublereal,
            fortran_complex, fortran_doublecomplex#
    #cmplx = 0, 0, 1, 1#
  */
+static inline void
+release_@lapack_func@(GELSD_PARAMS_t* params)
+{
+    /* A and WORK contain allocated blocks */
+    free(params->A);
+    free(params->WORK);
+    memset(params, 0, sizeof(*params));
+}
+
+/** Compute the squared l2 norm of a contiguous vector */
+static @basetyp@
+@TYPE@_abs2(@typ@ *p, npy_intp n) {
+    npy_intp i;
+    @basetyp@ res = 0;
+    for (i = 0; i < n; i++) {
+        @typ@ el = p[i];
+#if @cmplx@
+        res += el.real*el.real + el.imag*el.imag;
+#else
+        res += el*el;
+#endif
+    }
+    return res;
+}
+
 static void
-@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
-                  void *NPY_UNUSED(func))
+@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
+             void *NPY_UNUSED(func))
 {
-    GQR_PARAMS_t params;
+    GELSD_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
-    fortran_int n, m;
-    size_t safe_n, safe_m;
+    fortran_int n, m, nrhs;
+    fortran_int excess;
 
-    INIT_OUTER_LOOP_3
+    INIT_OUTER_LOOP_7
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-    safe_n = n;
-    safe_m = m;
-
+    nrhs = (fortran_int)dimensions[2];
+    excess = m - n;
 
-    if (init_@lapack_func@_complete(&params, m, n)) {
-        LINEARIZE_DATA_t a_in, tau_in, q_out;
+    if (init_@lapack_func@(&params, m, n, nrhs)) {
+        LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
-        init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
-        init_linearize_data(&q_out, m, m, steps[4], steps[3]);
+        init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
+        init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
+        init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
+        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
 
-        BEGIN_OUTER_LOOP_3
+        BEGIN_OUTER_LOOP_7
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            memcpy(params.Q, params.A, safe_n * safe_m * sizeof(@ftyp@));
-            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
+            params.RCOND = args[2];
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_in);
-                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+                delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
+                *(npy_int*) args[5] = params.RANK;
+                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
+
+                /* Note that linalg.lstsq discards this when excess == 0 */
+                if (excess >= 0 && params.RANK == n) {
+                    /* Compute the residuals as the square sum of each column */
+                    int i;
+                    char *resid = args[4];
+                    @ftyp@ *components = (@ftyp@ *)params.B + n;
+                    for (i = 0; i < nrhs; i++) {
+                        @ftyp@ *vector = components + i*m;
+                        /* Numpy and fortran floating types are the same size,
+                         * so this cast is safe */
+                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
+                        memcpy(
+                            resid + i*r_out.column_strides,
+                            &abs2, sizeof(abs2));
+                    }
+                }
+                else {
+                    /* Note that this is always discarded by linalg.lstsq */
+                    nan_@REALTYPE@_matrix(args[4], &r_out);
+                }
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[1], &tau_in);
-                nan_@TYPE@_matrix(args[2], &q_out);
+                nan_@TYPE@_matrix(args[3], &x_out);
+                nan_@REALTYPE@_matrix(args[4], &r_out);
+                *(npy_int*) args[5] = -1;
+                nan_@REALTYPE@_matrix(args[6], &s_out);
             }
         END_OUTER_LOOP
 
@@ -4157,10 +4156,10 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_r_raw);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_reduced);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_complete);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
 
@@ -4226,14 +4225,6 @@ static char svd_1_3_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
 };
 
-/*  A,           b,           rcond,      x,           resid,      rank,    s,        */
-static char lstsq_types[] = {
-    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,  NPY_FLOAT,   NPY_FLOAT,  NPY_INT, NPY_FLOAT,
-    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE, NPY_DOUBLE,  NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
-    NPY_CFLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_INT, NPY_FLOAT,
-    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
-};
-
 /* A, tau */
 static char qr_r_raw_types[] = {
     NPY_FLOAT,   NPY_FLOAT,
@@ -4258,6 +4249,14 @@ static char qr_complete_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
+/*  A,           b,           rcond,      x,           resid,      rank,    s,        */
+static char lstsq_types[] = {
+    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,  NPY_FLOAT,   NPY_FLOAT,  NPY_INT, NPY_FLOAT,
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE, NPY_DOUBLE,  NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
+    NPY_CFLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_INT, NPY_FLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
+};
+
 typedef struct gufunc_descriptor_struct {
     char *name;
     char *signature;
@@ -4449,24 +4448,6 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         FUNC_ARRAY_NAME(eigvals),
         eigvals_types
     },
-    {
-        "lstsq_m",
-        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
-        "least squares on the last two dimensions and broadcast to the rest. \n"\
-        "For m <= n. \n",
-        4, 3, 4,
-        FUNC_ARRAY_NAME(lstsq),
-        lstsq_types
-    },
-    {
-        "lstsq_n",
-        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)",
-        "least squares on the last two dimensions and broadcast to the rest. \n"\
-        "For m >= n, meaning that residuals are produced. \n",
-        4, 3, 4,
-        FUNC_ARRAY_NAME(lstsq),
-        lstsq_types
-    },
     {
         "qr_r_raw_m",
         "(m,n)->(m)",
@@ -4520,6 +4501,24 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         4, 2, 1,
         FUNC_ARRAY_NAME(qr_complete),
         qr_complete_types
+    },
+    {
+        "lstsq_m",
+        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
+        "least squares on the last two dimensions and broadcast to the rest. \n"\
+        "For m <= n. \n",
+        4, 3, 4,
+        FUNC_ARRAY_NAME(lstsq),
+        lstsq_types
+    },
+    {
+        "lstsq_n",
+        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)",
+        "least squares on the last two dimensions and broadcast to the rest. \n"\
+        "For m >= n, meaning that residuals are produced. \n",
+        4, 3, 4,
+        FUNC_ARRAY_NAME(lstsq),
+        lstsq_types
     }
 };
 

From 3f1c88c13a28c9b54f40d1b474efc2c3aee49c8c Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Wed, 9 Jun 2021 10:02:50 +0530
Subject: [PATCH 19/26] updated method for finding optimal work_count

---
 numpy/linalg/umath_linalg.c.src | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 3d74db210f6c..070d419298ca 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -2984,7 +2984,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count);
+                                    work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
     mem_buff2 = malloc(work_size);
@@ -3029,7 +3029,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
-    fortran_int work_count;
+    fortran_int work_count_r, work_count_i, work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
@@ -3060,7 +3060,11 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count_r = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count_i = (fortran_int) ((@ftyp@*)params->WORK)->i;
+
+        work_count = (fortran_int) sqrt((fortran_doublereal) work_count_r*work_count_r + 
+                                        (fortran_doublereal) work_count_i*work_count_i);
 
     }
 
@@ -3276,7 +3280,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count);
+                                    work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -3323,7 +3327,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
     size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
-    fortran_int work_count;
+    fortran_int work_count_r, work_count_i, work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
@@ -3355,7 +3359,11 @@ init_@lapack_func@(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count_r = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count_i = (fortran_int) ((@ftyp@*)params->WORK)->i;
+
+        work_count = (fortran_int) sqrt((fortran_doublereal) work_count_r*work_count_r + 
+                                        (fortran_doublereal) work_count_i*work_count_i);
 
     }
 
@@ -3519,7 +3527,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count);
+                                    work_count > 0 ? work_count : -work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -3566,7 +3574,7 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
     size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
-    fortran_int work_count;
+    fortran_int work_count_r, work_count_i, work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
@@ -3598,7 +3606,11 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count_r = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count_i = (fortran_int) ((@ftyp@*)params->WORK)->i;
+
+        work_count = (fortran_int) sqrt((fortran_doublereal) work_count_r*work_count_r + 
+                                        (fortran_doublereal) work_count_i*work_count_i);
 
     }
 

From e9df9fdc0a2d26b53b881d64b492454cbdeed3b4 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Tue, 22 Jun 2021 09:56:49 +0530
Subject: [PATCH 20/26] factored out common init parts for sorgqr, dorgqr

---
 numpy/linalg/umath_linalg.c.src | 166 +++++++++++---------------------
 1 file changed, 57 insertions(+), 109 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 070d419298ca..86946274f5e2 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3159,9 +3159,6 @@ static void
 
 /**end repeat**/
 
-/* -------------------------------------------------------------------------- */
-                 /* qr (modes - reduced) */
-
 typedef struct gqr_params_struct
 {
     fortran_int M;
@@ -3175,35 +3172,6 @@ typedef struct gqr_params_struct
     fortran_int LWORK;
 } GQR_PARAMS_t;
 
-
-static inline void
-dump_gqr_params(const char *name,
-                GQR_PARAMS_t *params)
-{
-    TRACE_TXT("\n%s:\n"\
-
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n",
-
-              name,
-
-              "Q", params->Q,
-              "TAU", params->TAU,
-              "WORK", params->WORK,
-
-              "M", (int)params->M,
-              "MC", (int)params->MC,
-              "MN", (int)params->MN,
-              "LDA", (int)params->LDA,
-              "LWORK", (int)params->LWORK);
-}
-
 /**begin repeat
    #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
  */
@@ -3228,19 +3196,21 @@ call_@lapack_func@(GQR_PARAMS_t *params)
    #ftyp=fortran_real,fortran_doublereal#
  */
 static inline int
-init_@lapack_func@(GQR_PARAMS_t *params,
-                   fortran_int m,
-                   fortran_int n)
+init_@lapack_func@_common(GQR_PARAMS_t *params,
+                          fortran_int m,
+                          fortran_int n,
+                          fortran_int mc)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *q, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_mc = mc;
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
@@ -3258,7 +3228,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 
     params->M = m;
-    params->MC = min_m_n;
+    params->MC = mc;
     params->MN = min_m_n;
     params->A = a;
     params->Q = q;
@@ -3305,6 +3275,53 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 /**end repeat**/
 
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - reduced) */
+
+
+static inline void
+dump_gqr_params(const char *name,
+                GQR_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n",
+
+              name,
+
+              "Q", params->Q,
+              "TAU", params->TAU,
+              "WORK", params->WORK,
+
+              "M", (int)params->M,
+              "MC", (int)params->MC,
+              "MN", (int)params->MN,
+              "LDA", (int)params->LDA,
+              "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sorgqr,dorgqr#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@(GQR_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    return init_@lapack_func@_common(params, m, n, fortran_int_min(m, n));
+}
+
+/**end repeat**/
+
 /**begin repeat
    #TYPE=CFLOAT,CDOUBLE#
    #lapack_func=cungqr,zungqr#
@@ -3475,79 +3492,10 @@ static void
  */
 static inline int
 init_@lapack_func@_complete(GQR_PARAMS_t *params,
-                   fortran_int m,
-                   fortran_int n)
+                            fortran_int m,
+                            fortran_int n)
 {
-    npy_uint8 *mem_buff = NULL;
-    npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *q, *tau, *work;
-    fortran_int min_m_n = fortran_int_min(m, n);
-    size_t safe_min_m_n = min_m_n;
-    size_t safe_m = m;
-    size_t safe_n = n;
-
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
-
-    fortran_int work_count;
-    size_t work_size;
-    fortran_int lda = fortran_int_max(1, m);
-
-    mem_buff = malloc(q_size + tau_size + a_size);
-
-    if (!mem_buff)
-        goto error;
-
-    q = mem_buff;
-    tau = q + q_size;
-    a = tau + tau_size;
-
-
-    params->M = m;
-    params->MC = m;
-    params->MN = min_m_n;
-    params->A = a;
-    params->Q = q;
-    params->TAU = tau;
-    params->LDA = lda;
-
-    {
-        /* compute optimal work size */
-        @ftyp@ work_size_query;
-
-        params->WORK = &work_size_query;
-        params->LWORK = -1;
-
-        if (call_@lapack_func@(params) != 0)
-            goto error;
-
-        work_count = (fortran_int) *(@ftyp@*) params->WORK;
-
-    }
-
-    params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
-
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
-
-    mem_buff2 = malloc(work_size);
-    if (!mem_buff2)
-        goto error;
-
-    work = mem_buff2;
-    memset(work, 0, work_size);
-
-    params->WORK = work;
-
-    return 1;
- error:
-    TRACE_TXT("%s failed init\n", __FUNCTION__);
-    free(mem_buff);
-    free(mem_buff2);
-    memset(params, 0, sizeof(*params));
-
-    return 0;
+    return init_@lapack_func@_common(params, m, n, m);
 }
 
 /**end repeat**/

From b5d469efbfa4a400fd9213cb0a5c11c826be9da3 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Tue, 22 Jun 2021 10:05:47 +0530
Subject: [PATCH 21/26] refactoring complete

---
 numpy/linalg/umath_linalg.c.src | 205 ++++++++++----------------------
 1 file changed, 60 insertions(+), 145 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 86946274f5e2..964c4d481d73 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3159,6 +3159,9 @@ static void
 
 /**end repeat**/
 
+/* -------------------------------------------------------------------------- */
+                 /* qr common code (modes - reduced and complete) */ 
+
 typedef struct gqr_params_struct
 {
     fortran_int M;
@@ -3191,7 +3194,6 @@ call_@lapack_func@(GQR_PARAMS_t *params)
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE#
    #lapack_func=sorgqr,dorgqr#
    #ftyp=fortran_real,fortran_doublereal#
  */
@@ -3275,73 +3277,27 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
 
 /**end repeat**/
 
-/* -------------------------------------------------------------------------- */
-                 /* qr (modes - reduced) */
-
-
-static inline void
-dump_gqr_params(const char *name,
-                GQR_PARAMS_t *params)
-{
-    TRACE_TXT("\n%s:\n"\
-
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n",
-
-              name,
-
-              "Q", params->Q,
-              "TAU", params->TAU,
-              "WORK", params->WORK,
-
-              "M", (int)params->M,
-              "MC", (int)params->MC,
-              "MN", (int)params->MN,
-              "LDA", (int)params->LDA,
-              "LWORK", (int)params->LWORK);
-}
-
-/**begin repeat
-   #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sorgqr,dorgqr#
-   #ftyp=fortran_real,fortran_doublereal#
- */
-static inline int
-init_@lapack_func@(GQR_PARAMS_t *params,
-                   fortran_int m,
-                   fortran_int n)
-{
-    return init_@lapack_func@_common(params, m, n, fortran_int_min(m, n));
-}
-
-/**end repeat**/
-
 /**begin repeat
-   #TYPE=CFLOAT,CDOUBLE#
    #lapack_func=cungqr,zungqr#
    #ftyp=fortran_complex,fortran_doublecomplex#
  */
 static inline int
-init_@lapack_func@(GQR_PARAMS_t *params,
-                   fortran_int m,
-                   fortran_int n)
+init_@lapack_func@_common(GQR_PARAMS_t *params,
+                          fortran_int m,
+                          fortran_int n,
+                          fortran_int mc)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *q, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_mc = mc;
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_min_m_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count_r, work_count_i, work_count;
@@ -3359,7 +3315,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 
     params->M = m;
-    params->MC = min_m_n;
+    params->MC = mc;
     params->MN = min_m_n;
     params->A = a;
     params->Q = q;
@@ -3411,6 +3367,54 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 
 /**end repeat**/
 
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - reduced) */
+
+
+static inline void
+dump_gqr_params(const char *name,
+                GQR_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n",
+
+              name,
+
+              "Q", params->Q,
+              "TAU", params->TAU,
+              "WORK", params->WORK,
+
+              "M", (int)params->M,
+              "MC", (int)params->MC,
+              "MN", (int)params->MN,
+              "LDA", (int)params->LDA,
+              "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #ftyp=fortran_real,fortran_doublereal,fortran_complex,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@(GQR_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    return init_@lapack_func@_common(
+        params, m, n, 
+        fortran_int_min(m, n));
+}
+
+/**end repeat**/
+
 /**begin repeat
    #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
  */
@@ -3486,9 +3490,8 @@ static void
                  /* qr (modes - complete) */
 
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sorgqr,dorgqr#
-   #ftyp=fortran_real,fortran_doublereal#
+   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #ftyp=fortran_real,fortran_doublereal,fortran_complex,fortran_doublecomplex#
  */
 static inline int
 init_@lapack_func@_complete(GQR_PARAMS_t *params,
@@ -3500,94 +3503,6 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 
 /**end repeat**/
 
-/**begin repeat
-   #TYPE=CFLOAT,CDOUBLE#
-   #lapack_func=cungqr,zungqr#
-   #ftyp=fortran_complex,fortran_doublecomplex#
- */
-static inline int
-init_@lapack_func@_complete(GQR_PARAMS_t *params,
-                            fortran_int m,
-                            fortran_int n)
-{
-    npy_uint8 *mem_buff = NULL;
-    npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *q, *tau, *work;
-    fortran_int min_m_n = fortran_int_min(m, n);
-    size_t safe_min_m_n = min_m_n;
-    size_t safe_m = m;
-    size_t safe_n = n;
-
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_m * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
-
-    fortran_int work_count_r, work_count_i, work_count;
-    size_t work_size;
-    fortran_int lda = fortran_int_max(1, m);
-
-    mem_buff = malloc(q_size + tau_size + a_size);
-
-    if (!mem_buff)
-        goto error;
-
-    q = mem_buff;
-    tau = q + q_size;
-    a = tau + tau_size;
-
-
-    params->M = m;
-    params->MC = m;
-    params->MN = min_m_n;
-    params->A = a;
-    params->Q = q;
-    params->TAU = tau;
-    params->LDA = lda;
-
-    {
-        /* compute optimal work size */
-        @ftyp@ work_size_query;
-
-        params->WORK = &work_size_query;
-        params->LWORK = -1;
-
-        if (call_@lapack_func@(params) != 0)
-            goto error;
-
-        work_count_r = (fortran_int) ((@ftyp@*)params->WORK)->r;
-        work_count_i = (fortran_int) ((@ftyp@*)params->WORK)->i;
-
-        work_count = (fortran_int) sqrt((fortran_doublereal) work_count_r*work_count_r + 
-                                        (fortran_doublereal) work_count_i*work_count_i);
-
-    }
-
-    params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count);
-
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
-    mem_buff2 = malloc(work_size);
-    if (!mem_buff2)
-        goto error;
-
-    work = mem_buff2;
-    memset(work, 0, work_size);
-
-    params->WORK = work;
-    params->LWORK = work_count;
-
-    return 1;
- error:
-    TRACE_TXT("%s failed init\n", __FUNCTION__);
-    free(mem_buff);
-    free(mem_buff2);
-    memset(params, 0, sizeof(*params));
-
-    return 0;
-}
-
-/**end repeat**/
-
 /**begin repeat
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#

From b498d65078359fcc88ca41f5f817f2673a90770b Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Thu, 24 Jun 2021 09:00:27 +0530
Subject: [PATCH 22/26] Addressed testing reviews

---
 numpy/linalg/linalg.py            |  4 ++--
 numpy/linalg/tests/test_linalg.py | 40 +++++++++++++++++--------------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 4b3ae79f820b..b7f21b1b70eb 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -100,7 +100,7 @@ def _raise_linalgerror_lstsq(err, flag):
     raise LinAlgError("SVD did not converge in Linear Least Squares")
 
 def _raise_linalgerror_qr(err, flag):
-    raise LinAlgError("Illegal argument found while performing "
+    raise LinAlgError("Incorrect argument found while performing "
                       "QR factorization")
 
 def get_linalg_error_extobj(callback):
@@ -785,7 +785,7 @@ def qr(a, mode='reduced'):
     Parameters
     ----------
     a : array_like, shape (..., M, N)
-        A real or complex array with ``a.ndim >= 2``.
+        An array-like object with the dimensionality of at least 2.
     mode : {'reduced', 'complete', 'r', 'raw'}, optional
         If K = min(M, N), then
 
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 51f10f27c76e..2fbab8cba65f 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1729,7 +1729,10 @@ def check_qr_stacked(self, a):
         assert_(q.shape[-2:] == (m, m))
         assert_(r.shape[-2:] == (m, n))
         assert_almost_equal(matmul(q, r), a)
-        assert_almost_equal(swapaxes(q, -1, -2).conj(), np.linalg.inv(q))
+        I_mat = np.identity(q.shape[-1])
+        stack_I_mat = np.broadcast_to(I_mat, 
+                        q.shape[:-2] + (q.shape[-1],)*2)
+        assert_almost_equal(matmul(swapaxes(q, -1, -2).conj(), q), stack_I_mat)
         assert_almost_equal(np.triu(r[..., :, :]), r)
 
         # mode == 'reduced'
@@ -1741,7 +1744,10 @@ def check_qr_stacked(self, a):
         assert_(q1.shape[-2:] == (m, k))
         assert_(r1.shape[-2:] == (k, n))
         assert_almost_equal(matmul(q1, r1), a)
-        assert_almost_equal(swapaxes(q, -1, -2).conj(), np.linalg.inv(q))
+        I_mat = np.identity(q1.shape[-1])
+        stack_I_mat = np.broadcast_to(I_mat, 
+                        q1.shape[:-2] + (q1.shape[-1],)*2)
+        assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1), stack_I_mat)
         assert_almost_equal(np.triu(r1[..., :, :]), r1)
 
         # mode == 'r'
@@ -1750,22 +1756,20 @@ def check_qr_stacked(self, a):
         assert_(isinstance(r2, a_type))
         assert_almost_equal(r2, r1)
 
-    def test_stacked_inputs(self):
-
-        normal = np.random.normal
-        sizes = [(3, 4), (4, 3), (4, 4), (3, 0), (0, 3)]
-        dts = [np.float32, np.float64, np.complex64]
-        for size in sizes:
-            for dt in dts:
-                a1, a2, a3, a4 = [normal(size=size), normal(size=size),
-                                  normal(size=size), normal(size=size)]
-                b1, b2, b3, b4 = [normal(size=size), normal(size=size),
-                                  normal(size=size), normal(size=size)]
-                A = np.asarray([[a1, a2], [a3, a4]], dtype=dt)
-                B = np.asarray([[b1, b2], [b3, b4]], dtype=dt)
-                self.check_qr_stacked(A)
-                self.check_qr_stacked(B)
-                self.check_qr_stacked(A + 1.j*B)
+    @pytest.mark.parametrize("size",  [
+        (3, 4), (4, 3), (4, 4), 
+        (3, 0), (0, 3)])
+    @pytest.mark.parametrize("outer_size",  [
+        (2, 2), (2,), (2, 3, 4)])
+    @pytest.mark.parametrize("dt",  [
+        np.single, np.double, 
+        np.csingle, np.cdouble])
+    def test_stacked_inputs(self, outer_size, size, dt):
+
+        A = np.random.normal(size=outer_size + size).astype(dt)
+        B = np.random.normal(size=outer_size + size).astype(dt)
+        self.check_qr_stacked(A)
+        self.check_qr_stacked(A + 1.j*B)
 
 
 class TestCholesky:

From e88849df2bb0c6a9c61df1c909bdbc0e87789329 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Thu, 24 Jun 2021 09:20:32 +0530
Subject: [PATCH 23/26] addressed C reviews

---
 numpy/linalg/umath_linalg.c.src | 41 +++++++--------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 964c4d481d73..1c2a03e8de4a 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -2983,8 +2983,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
     mem_buff2 = malloc(work_size);
@@ -2992,7 +2991,6 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         goto error;
 
     work = mem_buff2;
-    memset(work, 0, work_size);
 
     params->WORK = work;
 
@@ -3029,7 +3027,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
-    fortran_int work_count_r, work_count_i, work_count;
+    fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
@@ -3060,11 +3058,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count_r = (fortran_int) ((@ftyp@*)params->WORK)->r;
-        work_count_i = (fortran_int) ((@ftyp@*)params->WORK)->i;
-
-        work_count = (fortran_int) sqrt((fortran_doublereal) work_count_r*work_count_r + 
-                                        (fortran_doublereal) work_count_i*work_count_i);
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
     }
 
@@ -3078,7 +3072,6 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
         goto error;
 
     work = mem_buff2;
-    memset(work, 0, work_size);
 
     params->WORK = work;
 
@@ -3146,7 +3139,6 @@ static void
                 delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[0], &a_in);
                 nan_@TYPE@_matrix(args[1], &tau_out);
             }
         END_OUTER_LOOP
@@ -3251,8 +3243,7 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
 
     }
 
-    params->LWORK = fortran_int_max(fortran_int_max(1, n),
-                                    work_count > 0 ? work_count : -work_count);
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
 
     work_size = (size_t) params->LWORK * sizeof(@ftyp@);
 
@@ -3261,7 +3252,6 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
         goto error;
 
     work = mem_buff2;
-    memset(work, 0, work_size);
 
     params->WORK = work;
 
@@ -3300,7 +3290,7 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
     size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
-    fortran_int work_count_r, work_count_i, work_count;
+    fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
@@ -3332,11 +3322,7 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count_r = (fortran_int) ((@ftyp@*)params->WORK)->r;
-        work_count_i = (fortran_int) ((@ftyp@*)params->WORK)->i;
-
-        work_count = (fortran_int) sqrt((fortran_doublereal) work_count_r*work_count_r + 
-                                        (fortran_doublereal) work_count_i*work_count_i);
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
     }
 
@@ -3350,7 +3336,6 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
         goto error;
 
     work = mem_buff2;
-    memset(work, 0, work_size);
 
     params->WORK = work;
     params->LWORK = work_count;
@@ -3446,14 +3431,11 @@ static void
     GQR_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
-    size_t safe_min_m_n, safe_m;
 
     INIT_OUTER_LOOP_3
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-    safe_min_m_n = fortran_int_min(m, n);
-    safe_m = m;
 
     if (init_@lapack_func@(&params, m, n)) {
         LINEARIZE_DATA_t a_in, tau_in, q_out;
@@ -3465,15 +3447,13 @@ static void
         BEGIN_OUTER_LOOP_3
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            memcpy(params.Q, params.A, safe_min_m_n * safe_m * sizeof(@ftyp@));
+            linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
             linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_in);
                 delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[1], &tau_in);
                 nan_@TYPE@_matrix(args[2], &q_out);
             }
         END_OUTER_LOOP
@@ -3520,14 +3500,11 @@ static void
     GQR_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
-    size_t safe_n, safe_m;
 
     INIT_OUTER_LOOP_3
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-    safe_n = n;
-    safe_m = m;
 
 
     if (init_@lapack_func@_complete(&params, m, n)) {
@@ -3540,15 +3517,13 @@ static void
         BEGIN_OUTER_LOOP_3
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            memcpy(params.Q, params.A, safe_n * safe_m * sizeof(@ftyp@));
+            linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
             linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_in);
                 delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[1], &tau_in);
                 nan_@TYPE@_matrix(args[2], &q_out);
             }
         END_OUTER_LOOP

From c1f500b5d2f61cef242417a86da806bcef7faeb6 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Thu, 24 Jun 2021 09:33:09 +0530
Subject: [PATCH 24/26] fixed linting issues

---
 numpy/linalg/tests/test_linalg.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 2fbab8cba65f..4c54c0b534f6 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1747,7 +1747,8 @@ def check_qr_stacked(self, a):
         I_mat = np.identity(q1.shape[-1])
         stack_I_mat = np.broadcast_to(I_mat, 
                         q1.shape[:-2] + (q1.shape[-1],)*2)
-        assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1), stack_I_mat)
+        assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1), 
+                            stack_I_mat)
         assert_almost_equal(np.triu(r1[..., :, :]), r1)
 
         # mode == 'r'
@@ -1756,12 +1757,12 @@ def check_qr_stacked(self, a):
         assert_(isinstance(r2, a_type))
         assert_almost_equal(r2, r1)
 
-    @pytest.mark.parametrize("size",  [
+    @pytest.mark.parametrize("size", [
         (3, 4), (4, 3), (4, 4), 
         (3, 0), (0, 3)])
-    @pytest.mark.parametrize("outer_size",  [
+    @pytest.mark.parametrize("outer_size", [
         (2, 2), (2,), (2, 3, 4)])
-    @pytest.mark.parametrize("dt",  [
+    @pytest.mark.parametrize("dt", [
         np.single, np.double, 
         np.csingle, np.cdouble])
     def test_stacked_inputs(self, outer_size, size, dt):

From 6ab34ef44aede6b09148ed6868c8ad2b0846e8e4 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Fri, 9 Jul 2021 14:04:26 +0530
Subject: [PATCH 25/26] removed redundant functions

---
 numpy/linalg/umath_linalg.c.src | 116 ++++++++++++++------------------
 1 file changed, 49 insertions(+), 67 deletions(-)

diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 1c2a03e8de4a..27fe51732b93 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -163,14 +163,6 @@ FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               fortran_int *info);
 
 extern fortran_int
-FNAME(sgeqrf)(fortran_int *m, fortran_int *n, float a[], fortran_int *lda,
-              float tau[], float work[],
-              fortran_int *lwork, fortran_int *info);
-extern fortran_int
-FNAME(cgeqrf)(fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int *lda,
-              f2c_complex tau[], f2c_complex work[],
-              fortran_int *lwork, fortran_int *info);
-extern fortran_int
 FNAME(dgeqrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda,
               double tau[], double work[],
               fortran_int *lwork, fortran_int *info);
@@ -180,18 +172,10 @@ FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int
               fortran_int *lwork, fortran_int *info);
 
 extern fortran_int
-FNAME(sorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, float a[], fortran_int *lda,
-              float tau[], float work[],
-              fortran_int *lwork, fortran_int *info);
-extern fortran_int
 FNAME(dorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, double a[], fortran_int *lda,
               double tau[], double work[],
               fortran_int *lwork, fortran_int *info);
 extern fortran_int
-FNAME(cungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_complex a[],
-              fortran_int *lda, f2c_complex tau[],
-              f2c_complex work[], fortran_int *lwork, fortran_int *info);
-extern fortran_int
 FNAME(zungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex a[],
               fortran_int *lda, f2c_doublecomplex tau[],
               f2c_doublecomplex work[], fortran_int *lwork, fortran_int *info);
@@ -2910,7 +2894,7 @@ dump_geqrf_params(const char *name,
 }
 
 /**begin repeat
-   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #lapack_func=dgeqrf,zgeqrf#
  */
 
 static inline fortran_int
@@ -2928,9 +2912,9 @@ call_@lapack_func@(GEQRF_PARAMS_t *params)
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sgeqrf,dgeqrf#
-   #ftyp=fortran_real,fortran_doublereal#
+   #TYPE=DOUBLE#
+   #lapack_func=dgeqrf#
+   #ftyp=fortran_doublereal#
  */
 static inline int
 init_@lapack_func@(GEQRF_PARAMS_t *params,
@@ -3007,9 +2991,9 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=CFLOAT,CDOUBLE#
-   #lapack_func=cgeqrf,zgeqrf#
-   #ftyp=fortran_complex,fortran_doublecomplex#
+   #TYPE=CDOUBLE#
+   #lapack_func=zgeqrf#
+   #ftyp=fortran_doublecomplex#
  */
 static inline int
 init_@lapack_func@(GEQRF_PARAMS_t *params,
@@ -3088,7 +3072,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
+   #lapack_func=dgeqrf,zgeqrf#
  */
 static inline void
 release_@lapack_func@(GEQRF_PARAMS_t* params)
@@ -3102,14 +3086,13 @@ release_@lapack_func@(GEQRF_PARAMS_t* params)
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
-   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sgeqrf,dgeqrf,cgeqrf,zgeqrf#
-   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float,  npy_double#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #cmplx = 0, 0, 1, 1#
+   #TYPE=DOUBLE,CDOUBLE#
+   #REALTYPE=DOUBLE,DOUBLE#
+   #lapack_func=dgeqrf,zgeqrf#
+   #typ     = npy_double,npy_cdouble#
+   #basetyp = npy_double,npy_double#
+   #ftyp = fortran_doublereal,fortran_doublecomplex#
+   #cmplx = 0, 1#
  */
 static void
 @TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
@@ -3168,7 +3151,7 @@ typedef struct gqr_params_struct
 } GQR_PARAMS_t;
 
 /**begin repeat
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #lapack_func=dorgqr,zungqr#
  */
 
 static inline fortran_int
@@ -3186,8 +3169,8 @@ call_@lapack_func@(GQR_PARAMS_t *params)
 /**end repeat**/
 
 /**begin repeat
-   #lapack_func=sorgqr,dorgqr#
-   #ftyp=fortran_real,fortran_doublereal#
+   #lapack_func=dorgqr#
+   #ftyp=fortran_doublereal#
  */
 static inline int
 init_@lapack_func@_common(GQR_PARAMS_t *params,
@@ -3268,8 +3251,8 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #lapack_func=cungqr,zungqr#
-   #ftyp=fortran_complex,fortran_doublecomplex#
+   #lapack_func=zungqr#
+   #ftyp=fortran_doublecomplex#
  */
 static inline int
 init_@lapack_func@_common(GQR_PARAMS_t *params,
@@ -3385,8 +3368,8 @@ dump_gqr_params(const char *name,
 }
 
 /**begin repeat
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
-   #ftyp=fortran_real,fortran_doublereal,fortran_complex,fortran_doublecomplex#
+   #lapack_func=dorgqr,zungqr#
+   #ftyp=fortran_doublereal,fortran_doublecomplex#
  */
 static inline int
 init_@lapack_func@(GQR_PARAMS_t *params,
@@ -3401,7 +3384,7 @@ init_@lapack_func@(GQR_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
+   #lapack_func=dorgqr,zungqr#
  */
 static inline void
 release_@lapack_func@(GQR_PARAMS_t* params)
@@ -3415,14 +3398,13 @@ release_@lapack_func@(GQR_PARAMS_t* params)
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
-   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
-   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float,  npy_double#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #cmplx = 0, 0, 1, 1#
+   #TYPE=DOUBLE,CDOUBLE#
+   #REALTYPE=DOUBLE,DOUBLE#
+   #lapack_func=dorgqr,zungqr#
+   #typ     = npy_double, npy_cdouble#
+   #basetyp = npy_double, npy_double#
+   #ftyp = fortran_doublereal,fortran_doublecomplex#
+   #cmplx = 0, 1#
  */
 static void
 @TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
@@ -3470,8 +3452,8 @@ static void
                  /* qr (modes - complete) */
 
 /**begin repeat
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
-   #ftyp=fortran_real,fortran_doublereal,fortran_complex,fortran_doublecomplex#
+   #lapack_func=dorgqr,zungqr#
+   #ftyp=fortran_doublereal,fortran_doublecomplex#
  */
 static inline int
 init_@lapack_func@_complete(GQR_PARAMS_t *params,
@@ -3484,14 +3466,13 @@ init_@lapack_func@_complete(GQR_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
-   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sorgqr,dorgqr,cungqr,zungqr#
-   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float,  npy_double#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #cmplx = 0, 0, 1, 1#
+   #TYPE=DOUBLE,CDOUBLE#
+   #REALTYPE=DOUBLE,DOUBLE#
+   #lapack_func=dorgqr,zungqr#
+   #typ     = npy_double,npy_cdouble#
+   #basetyp = npy_double,npy_double#
+   #ftyp = fortran_doublereal,fortran_doublecomplex#
+   #cmplx = 0, 1#
  */
 static void
 @TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
@@ -3992,6 +3973,13 @@ static void *array_of_nulls[] = {
         CDOUBLE_ ## NAME                                \
     }
 
+#define GUFUNC_FUNC_ARRAY_QR(NAME)                      \
+    static PyUFuncGenericFunction                       \
+    FUNC_ARRAY_NAME(NAME)[] = {                         \
+        DOUBLE_ ## NAME,                                \
+        CDOUBLE_ ## NAME                                \
+    }
+
 
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det);
@@ -4006,9 +3994,9 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_r_raw);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_reduced);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(qr_complete);
+GUFUNC_FUNC_ARRAY_QR(qr_r_raw);
+GUFUNC_FUNC_ARRAY_QR(qr_reduced);
+GUFUNC_FUNC_ARRAY_QR(qr_complete);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
@@ -4077,25 +4065,19 @@ static char svd_1_3_types[] = {
 
 /* A, tau */
 static char qr_r_raw_types[] = {
-    NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,
-    NPY_CFLOAT,  NPY_CFLOAT,
     NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
 /* A, tau, q */
 static char qr_reduced_types[] = {
-    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
-    NPY_CFLOAT,  NPY_CFLOAT,  NPY_CFLOAT,
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
 };
 
 /* A, tau, q */
 static char qr_complete_types[] = {
-    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,
     NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
-    NPY_CFLOAT,  NPY_CFLOAT,  NPY_CFLOAT,
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
 };
 

From 6e405d53a504d6f97c8b1227d7f4d3c3c1aa2834 Mon Sep 17 00:00:00 2001
From: czgdp1807 <gdp.1807@gmail.com>
Date: Sat, 10 Jul 2021 11:27:30 +0530
Subject: [PATCH 26/26] removed redudancy in code

---
 numpy/linalg/linalg.py          | 26 +++++++---------------
 numpy/linalg/umath_linalg.c.src | 38 +++++++++++----------------------
 2 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index b7f21b1b70eb..560b7e9dc40b 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -940,20 +940,17 @@ def qr(a, mode='reduced'):
     # handle modes that don't return q
     if mode == 'r':
         r = triu(a[..., :mn, :])
-        if t != result_t:
-            r = r.astype(result_t, copy=False)
+        r = r.astype(result_t, copy=False)
         return wrap(r)
 
     if mode == 'raw':
         q = transpose(a)
-        if t != result_t:
-            q = q.astype(result_t, copy=False)
-            tau = tau.astype(result_t, copy=False)
+        q = q.astype(result_t, copy=False)
+        tau = tau.astype(result_t, copy=False)
         return wrap(q), tau
 
     if mode == 'economic':
-        if t != result_t :
-            a = a.astype(result_t, copy=False)
+        a = a.astype(result_t, copy=False)
         return wrap(a)
 
     # mc is the number of columns in the resulting q
@@ -962,25 +959,18 @@ def qr(a, mode='reduced'):
     # then it is the minimum of number of rows and columns.
     if mode == 'complete' and m > n:
         mc = m
-        if m <= n:
-            gufunc = _umath_linalg.qr_complete_m
-        else:
-            gufunc = _umath_linalg.qr_complete_n
+        gufunc = _umath_linalg.qr_complete
     else:
         mc = mn
-        if m <= n:
-            gufunc = _umath_linalg.qr_reduced_m
-        else:
-            gufunc = _umath_linalg.qr_reduced_n
+        gufunc = _umath_linalg.qr_reduced
 
     signature = 'DD->D' if isComplexType(t) else 'dd->d'
     extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
     q = gufunc(a, tau, signature=signature, extobj=extobj)
     r = triu(a[..., :mc, :])
 
-    if t != result_t:
-        q = q.astype(result_t, copy=False)
-        r = r.astype(result_t, copy=False)
+    q = q.astype(result_t, copy=False)
+    r = r.astype(result_t, copy=False)
 
     return wrap(q), wrap(r)
 
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 27fe51732b93..a486e9e5b41c 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3973,6 +3973,10 @@ static void *array_of_nulls[] = {
         CDOUBLE_ ## NAME                                \
     }
 
+/* The single precision functions are not used at all,
+ * due to input data being promoted to double precision
+ * in Python, so they are not implemented here.
+ */
 #define GUFUNC_FUNC_ARRAY_QR(NAME)                      \
     static PyUFuncGenericFunction                       \
     FUNC_ARRAY_NAME(NAME)[] = {                         \
@@ -4285,7 +4289,7 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         "(m,n)->(m)",
         "Compute TAU vector for the last two dimensions \n"\
         "and broadcast to the rest. For m <= n. \n",
-        4, 1, 1,
+        2, 1, 1,
         FUNC_ARRAY_NAME(qr_r_raw),
         qr_r_raw_types
     },
@@ -4294,43 +4298,25 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         "(m,n)->(n)",
         "Compute TAU vector for the last two dimensions \n"\
         "and broadcast to the rest. For m > n. \n",
-        4, 1, 1,
+        2, 1, 1,
         FUNC_ARRAY_NAME(qr_r_raw),
         qr_r_raw_types
     },
     {
-        "qr_reduced_m",
-        "(m,n),(m)->(m,m)",
-        "Compute Q matrix for the last two dimensions \n"\
-        "and broadcast to the rest. For m <= n. \n",
-        4, 2, 1,
-        FUNC_ARRAY_NAME(qr_reduced),
-        qr_reduced_types
-    },
-    {
-        "qr_reduced_n",
-        "(m,n),(n)->(m,n)",
+        "qr_reduced",
+        "(m,n),(k)->(m,k)",
         "Compute Q matrix for the last two dimensions \n"\
-        "and broadcast to the rest. For m > n. \n",
-        4, 2, 1,
+        "and broadcast to the rest. \n",
+        2, 2, 1,
         FUNC_ARRAY_NAME(qr_reduced),
         qr_reduced_types
     },
     {
-        "qr_complete_m",
-        "(m,n),(m)->(m,m)",
-        "Compute Q matrix for the last two dimensions \n"\
-        "and broadcast to the rest. For m <= n. \n",
-        4, 2, 1,
-        FUNC_ARRAY_NAME(qr_complete),
-        qr_complete_types
-    },
-    {
-        "qr_complete_n",
+        "qr_complete",
         "(m,n),(n)->(m,m)",
         "Compute Q matrix for the last two dimensions \n"\
         "and broadcast to the rest. For m > n. \n",
-        4, 2, 1,
+        2, 2, 1,
         FUNC_ARRAY_NAME(qr_complete),
         qr_complete_types
     },