diff --git a/doc/release/upcoming_changes/19151.improvement.rst b/doc/release/upcoming_changes/19151.improvement.rst
new file mode 100644
index 000000000000..2108b9c4f96d
--- /dev/null
+++ b/doc/release/upcoming_changes/19151.improvement.rst
@@ -0,0 +1,6 @@
+`numpy.linalg.qr` accepts stacked matrices as inputs
+----------------------------------------------------
+
+`numpy.linalg.qr` is able to produce results for stacked matrices as inputs.
+Moreover, the implementation of QR decomposition has been shifted to C
+from Python.
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 46fb2502e5cc..560b7e9dc40b 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -99,6 +99,10 @@ def _raise_linalgerror_svd_nonconvergence(err, flag):
 def _raise_linalgerror_lstsq(err, flag):
     raise LinAlgError("SVD did not converge in Linear Least Squares")
 
+def _raise_linalgerror_qr(err, flag):
+    raise LinAlgError("Incorrect argument found while performing "
+                      "QR factorization")
+
 def get_linalg_error_extobj(callback):
     extobj = list(_linalg_error_extobj)  # make a copy
     extobj[2] = callback
@@ -780,15 +784,16 @@ def qr(a, mode='reduced'):
 
     Parameters
     ----------
-    a : array_like, shape (M, N)
-        Matrix to be factored.
+    a : array_like, shape (..., M, N)
+        An array-like object with the dimensionality of at least 2.
     mode : {'reduced', 'complete', 'r', 'raw'}, optional
         If K = min(M, N), then
 
-        * 'reduced'  : returns q, r with dimensions (M, K), (K, N) (default)
-        * 'complete' : returns q, r with dimensions (M, M), (M, N)
-        * 'r'        : returns r only with dimensions (K, N)
-        * 'raw'      : returns h, tau with dimensions (N, M), (K,)
+        * 'reduced'  : returns q, r with dimensions
+                       (..., M, K), (..., K, N) (default)
+        * 'complete' : returns q, r with dimensions (..., M, M), (..., M, N)
+        * 'r'        : returns r only with dimensions (..., K, N)
+        * 'raw'      : returns h, tau with dimensions (..., N, M), (..., K,)
 
         The options 'reduced', 'complete, and 'raw' are new in numpy 1.8,
         see the notes for more information. The default is 'reduced', and to
@@ -807,9 +812,13 @@ def qr(a, mode='reduced'):
         A matrix with orthonormal columns. When mode = 'complete' the
         result is an orthogonal/unitary matrix depending on whether or not
         a is real/complex. The determinant may be either +/- 1 in that
-        case.
+        case. In case the number of dimensions in the input array is
+        greater than 2 then a stack of the matrices with above properties
+        is returned.
     r : ndarray of float or complex, optional
-        The upper-triangular matrix.
+        The upper-triangular matrix or a stack of upper-triangular
+        matrices if the number of dimensions in the input array is greater
+        than 2.
     (h, tau) : ndarrays of np.double or np.cdouble, optional
         The array h contains the Householder reflectors that generate q
         along with r. The tau array contains scaling factors for the
@@ -857,6 +866,14 @@ def qr(a, mode='reduced'):
     >>> r2 = np.linalg.qr(a, mode='r')
     >>> np.allclose(r, r2)  # mode='r' returns the same r as mode='full'
     True
+    >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input
+    >>> q, r = np.linalg.qr(a)
+    >>> q.shape
+    (3, 2, 2)
+    >>> r.shape
+    (3, 2, 2)
+    >>> np.allclose(a, np.matmul(q, r))
+    True
 
     Example illustrating a common use of `qr`: solving of least squares
     problems
@@ -904,83 +921,58 @@ def qr(a, mode='reduced'):
             raise ValueError(f"Unrecognized mode '{mode}'")
 
     a, wrap = _makearray(a)
-    _assert_2d(a)
-    m, n = a.shape
+    _assert_stacked_2d(a)
+    m, n = a.shape[-2:]
     t, result_t = _commonType(a)
-    a = _fastCopyAndTranspose(t, a)
+    a = a.astype(t, copy=True)
     a = _to_native_byte_order(a)
     mn = min(m, n)
-    tau = zeros((mn,), t)
 
-    if isComplexType(t):
-        lapack_routine = lapack_lite.zgeqrf
-        routine_name = 'zgeqrf'
+    if m <= n:
+        gufunc = _umath_linalg.qr_r_raw_m
     else:
-        lapack_routine = lapack_lite.dgeqrf
-        routine_name = 'dgeqrf'
-
-    # calculate optimal size of work data 'work'
-    lwork = 1
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, n, a, max(1, m), tau, work, -1, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-    # do qr decomposition
-    lwork = max(1, n, int(abs(work[0])))
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, n, a, max(1, m), tau, work, lwork, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
+        gufunc = _umath_linalg.qr_r_raw_n
+
+    signature = 'D->D' if isComplexType(t) else 'd->d'
+    extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
+    tau = gufunc(a, signature=signature, extobj=extobj)
 
     # handle modes that don't return q
     if mode == 'r':
-        r = _fastCopyAndTranspose(result_t, a[:, :mn])
-        return wrap(triu(r))
+        r = triu(a[..., :mn, :])
+        r = r.astype(result_t, copy=False)
+        return wrap(r)
 
     if mode == 'raw':
-        return a, tau
+        q = transpose(a)
+        q = q.astype(result_t, copy=False)
+        tau = tau.astype(result_t, copy=False)
+        return wrap(q), tau
 
     if mode == 'economic':
-        if t != result_t :
-            a = a.astype(result_t, copy=False)
-        return wrap(a.T)
+        a = a.astype(result_t, copy=False)
+        return wrap(a)
 
-    #  generate q from a
+    # mc is the number of columns in the resulting q
+    # matrix. If the mode is complete then it is 
+    # same as number of rows, and if the mode is reduced,
+    # then it is the minimum of number of rows and columns.
     if mode == 'complete' and m > n:
         mc = m
-        q = empty((m, m), t)
+        gufunc = _umath_linalg.qr_complete
     else:
         mc = mn
-        q = empty((n, m), t)
-    q[:n] = a
-
-    if isComplexType(t):
-        lapack_routine = lapack_lite.zungqr
-        routine_name = 'zungqr'
-    else:
-        lapack_routine = lapack_lite.dorgqr
-        routine_name = 'dorgqr'
+        gufunc = _umath_linalg.qr_reduced
 
-    # determine optimal lwork
-    lwork = 1
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, -1, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-    # compute q
-    lwork = max(1, n, int(abs(work[0])))
-    work = zeros((lwork,), t)
-    results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, lwork, 0)
-    if results['info'] != 0:
-        raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
-    q = _fastCopyAndTranspose(result_t, q[:mc])
-    r = _fastCopyAndTranspose(result_t, a[:, :mc])
+    signature = 'DD->D' if isComplexType(t) else 'dd->d'
+    extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
+    q = gufunc(a, tau, signature=signature, extobj=extobj)
+    r = triu(a[..., :mc, :])
 
-    return wrap(q), wrap(triu(r))
+    q = q.astype(result_t, copy=False)
+    r = r.astype(result_t, copy=False)
 
+    return wrap(q), wrap(r)
 
 # Eigenvalues
 
@@ -2177,7 +2169,7 @@ def lstsq(a, b, rcond="warn"):
     equal to, or greater than its number of linearly independent columns).
     If `a` is square and of full rank, then `x` (but for round-off error)
     is the "exact" solution of the equation. Else, `x` minimizes the
-    Euclidean 2-norm :math:`||b - ax||`. If there are multiple minimizing 
+    Euclidean 2-norm :math:`||b - ax||`. If there are multiple minimizing
     solutions, the one with the smallest 2-norm :math:`||x||` is returned.
 
     Parameters
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index c6e8cdd039f1..4c54c0b534f6 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1,6 +1,7 @@
 """ Test functions for linalg module
 
 """
+from numpy.core.fromnumeric import shape
 import os
 import sys
 import itertools
@@ -11,6 +12,7 @@
 
 import numpy as np
 from numpy import array, single, double, csingle, cdouble, dot, identity, matmul
+from numpy.core import swapaxes
 from numpy import multiply, atleast_2d, inf, asarray
 from numpy import linalg
 from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError
@@ -1710,6 +1712,66 @@ def test_mode_all_but_economic(self):
             self.check_qr(m2)
             self.check_qr(m2.T)
 
+    def check_qr_stacked(self, a):
+        # This test expects the argument `a` to be an ndarray or
+        # a subclass of an ndarray of inexact type.
+        a_type = type(a)
+        a_dtype = a.dtype
+        m, n = a.shape[-2:]
+        k = min(m, n)
+
+        # mode == 'complete'
+        q, r = linalg.qr(a, mode='complete')
+        assert_(q.dtype == a_dtype)
+        assert_(r.dtype == a_dtype)
+        assert_(isinstance(q, a_type))
+        assert_(isinstance(r, a_type))
+        assert_(q.shape[-2:] == (m, m))
+        assert_(r.shape[-2:] == (m, n))
+        assert_almost_equal(matmul(q, r), a)
+        I_mat = np.identity(q.shape[-1])
+        stack_I_mat = np.broadcast_to(I_mat, 
+                        q.shape[:-2] + (q.shape[-1],)*2)
+        assert_almost_equal(matmul(swapaxes(q, -1, -2).conj(), q), stack_I_mat)
+        assert_almost_equal(np.triu(r[..., :, :]), r)
+
+        # mode == 'reduced'
+        q1, r1 = linalg.qr(a, mode='reduced')
+        assert_(q1.dtype == a_dtype)
+        assert_(r1.dtype == a_dtype)
+        assert_(isinstance(q1, a_type))
+        assert_(isinstance(r1, a_type))
+        assert_(q1.shape[-2:] == (m, k))
+        assert_(r1.shape[-2:] == (k, n))
+        assert_almost_equal(matmul(q1, r1), a)
+        I_mat = np.identity(q1.shape[-1])
+        stack_I_mat = np.broadcast_to(I_mat, 
+                        q1.shape[:-2] + (q1.shape[-1],)*2)
+        assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1), 
+                            stack_I_mat)
+        assert_almost_equal(np.triu(r1[..., :, :]), r1)
+
+        # mode == 'r'
+        r2 = linalg.qr(a, mode='r')
+        assert_(r2.dtype == a_dtype)
+        assert_(isinstance(r2, a_type))
+        assert_almost_equal(r2, r1)
+
+    @pytest.mark.parametrize("size", [
+        (3, 4), (4, 3), (4, 4), 
+        (3, 0), (0, 3)])
+    @pytest.mark.parametrize("outer_size", [
+        (2, 2), (2,), (2, 3, 4)])
+    @pytest.mark.parametrize("dt", [
+        np.single, np.double, 
+        np.csingle, np.cdouble])
+    def test_stacked_inputs(self, outer_size, size, dt):
+
+        A = np.random.normal(size=outer_size + size).astype(dt)
+        B = np.random.normal(size=outer_size + size).astype(dt)
+        self.check_qr_stacked(A)
+        self.check_qr_stacked(A + 1.j*B)
+
 
 class TestCholesky:
     # TODO: are there no other tests for cholesky?
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 1807aadcf584..a486e9e5b41c 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -162,6 +162,24 @@ FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               double rwork[], fortran_int iwork[],
               fortran_int *info);
 
+extern fortran_int
+FNAME(dgeqrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda,
+              double tau[], double work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int
+FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda,
+              f2c_doublecomplex tau[], f2c_doublecomplex work[],
+              fortran_int *lwork, fortran_int *info);
+
+extern fortran_int
+FNAME(dorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, double a[], fortran_int *lda,
+              double tau[], double work[],
+              fortran_int *lwork, fortran_int *info);
+extern fortran_int
+FNAME(zungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex a[],
+              fortran_int *lda, f2c_doublecomplex tau[],
+              f2c_doublecomplex work[], fortran_int *lwork, fortran_int *info);
+
 extern fortran_int
 FNAME(sgesv)(fortran_int *n, fortran_int *nrhs,
              float a[], fortran_int *lda,
@@ -2834,170 +2852,131 @@ static void
 
 /**end repeat**/
 
-
 /* -------------------------------------------------------------------------- */
-                 /* least squares */
+                 /* qr (modes - r, raw) */
 
-typedef struct gelsd_params_struct
+typedef struct geqfr_params_struct
 {
     fortran_int M;
     fortran_int N;
-    fortran_int NRHS;
     void *A;
     fortran_int LDA;
-    void *B;
-    fortran_int LDB;
-    void *S;
-    void *RCOND;
-    fortran_int RANK;
+    void* TAU;
     void *WORK;
     fortran_int LWORK;
-    void *RWORK;
-    void *IWORK;
-} GELSD_PARAMS_t;
+} GEQRF_PARAMS_t;
 
 
 static inline void
-dump_gelsd_params(const char *name,
-                  GELSD_PARAMS_t *params)
+dump_geqrf_params(const char *name,
+                  GEQRF_PARAMS_t *params)
 {
     TRACE_TXT("\n%s:\n"\
 
               "%14s: %18p\n"\
               "%14s: %18p\n"\
               "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-              "%14s: %18p\n"\
-
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
-              "%14s: %18d\n"\
               "%14s: %18d\n"\
               "%14s: %18d\n"\
               "%14s: %18d\n"\
-
-              "%14s: %18p\n",
+              "%14s: %18d\n",
 
               name,
 
               "A", params->A,
-              "B", params->B,
-              "S", params->S,
+              "TAU", params->TAU,
               "WORK", params->WORK,
-              "RWORK", params->RWORK,
-              "IWORK", params->IWORK,
 
               "M", (int)params->M,
               "N", (int)params->N,
-              "NRHS", (int)params->NRHS,
               "LDA", (int)params->LDA,
-              "LDB", (int)params->LDB,
-              "LWORK", (int)params->LWORK,
-              "RANK", (int)params->RANK,
-
-              "RCOND", params->RCOND);
+              "LWORK", (int)params->LWORK);
 }
 
-
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sgelsd,dgelsd#
-   #ftyp=fortran_real,fortran_doublereal#
+   #lapack_func=dgeqrf,zgeqrf#
  */
 
 static inline fortran_int
-call_@lapack_func@(GELSD_PARAMS_t *params)
+call_@lapack_func@(GEQRF_PARAMS_t *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+    LAPACK(@lapack_func@)(&params->M, &params->N,
                           params->A, &params->LDA,
-                          params->B, &params->LDB,
-                          params->S,
-                          params->RCOND, &params->RANK,
+                          params->TAU,
                           params->WORK, &params->LWORK,
-                          params->IWORK,
                           &rv);
     return rv;
 }
 
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=DOUBLE#
+   #lapack_func=dgeqrf#
+   #ftyp=fortran_doublereal#
+ */
 static inline int
-init_@lapack_func@(GELSD_PARAMS_t *params,
+init_@lapack_func@(GEQRF_PARAMS_t *params,
                    fortran_int m,
-                   fortran_int n,
-                   fortran_int nrhs)
+                   fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *b, *s, *work, *iwork;
+    npy_uint8 *a, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
-    fortran_int max_m_n = fortran_int_max(m, n);
     size_t safe_min_m_n = min_m_n;
-    size_t safe_max_m_n = max_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-    size_t safe_nrhs = nrhs;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
-    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
     size_t work_size;
-    size_t iwork_size;
     fortran_int lda = fortran_int_max(1, m);
-    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(a_size + b_size + s_size);
+    mem_buff = malloc(a_size + tau_size);
 
     if (!mem_buff)
         goto error;
 
     a = mem_buff;
-    b = a + a_size;
-    s = b + b_size;
+    tau = a + a_size;
+    memset(tau, 0, tau_size);
 
 
     params->M = m;
     params->N = n;
-    params->NRHS = nrhs;
     params->A = a;
-    params->B = b;
-    params->S = s;
+    params->TAU = tau;
     params->LDA = lda;
-    params->LDB = ldb;
 
     {
         /* compute optimal work size */
+
         @ftyp@ work_size_query;
-        fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
-        params->IWORK = &iwork_size_query;
-        params->RWORK = NULL;
         params->LWORK = -1;
 
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query;
+        work_count = (fortran_int) *(@ftyp@*) params->WORK;
 
-        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
-        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    mem_buff2 = malloc(work_size + iwork_size);
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+    mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
-    iwork = work + work_size;
 
     params->WORK = work;
-    params->RWORK = NULL;
-    params->IWORK = iwork;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3012,106 +2991,73 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
 /**end repeat**/
 
 /**begin repeat
-   #TYPE=CFLOAT,CDOUBLE#
-   #ftyp=fortran_complex,fortran_doublecomplex#
-   #frealtyp=fortran_real,fortran_doublereal#
-   #typ=COMPLEX_t,DOUBLECOMPLEX_t#
-   #lapack_func=cgelsd,zgelsd#
+   #TYPE=CDOUBLE#
+   #lapack_func=zgeqrf#
+   #ftyp=fortran_doublecomplex#
  */
-
-static inline fortran_int
-call_@lapack_func@(GELSD_PARAMS_t *params)
-{
-    fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
-                          params->A, &params->LDA,
-                          params->B, &params->LDB,
-                          params->S,
-                          params->RCOND, &params->RANK,
-                          params->WORK, &params->LWORK,
-                          params->RWORK, params->IWORK,
-                          &rv);
-    return rv;
-}
-
 static inline int
-init_@lapack_func@(GELSD_PARAMS_t *params,
+init_@lapack_func@(GEQRF_PARAMS_t *params,
                    fortran_int m,
-                   fortran_int n,
-                   fortran_int nrhs)
+                   fortran_int n)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
-    npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
+    npy_uint8 *a, *tau, *work;
     fortran_int min_m_n = fortran_int_min(m, n);
-    fortran_int max_m_n = fortran_int_max(m, n);
     size_t safe_min_m_n = min_m_n;
-    size_t safe_max_m_n = max_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-    size_t safe_nrhs = nrhs;
 
     size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
-    size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
 
     fortran_int work_count;
-    size_t work_size, rwork_size, iwork_size;
+    size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
-    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(a_size + b_size + s_size);
+    mem_buff = malloc(a_size + tau_size);
 
     if (!mem_buff)
         goto error;
 
     a = mem_buff;
-    b = a + a_size;
-    s = b + b_size;
+    tau = a + a_size;
+    memset(tau, 0, tau_size);
 
 
     params->M = m;
     params->N = n;
-    params->NRHS = nrhs;
     params->A = a;
-    params->B = b;
-    params->S = s;
+    params->TAU = tau;
     params->LDA = lda;
-    params->LDB = ldb;
 
     {
         /* compute optimal work size */
+
         @ftyp@ work_size_query;
-        @frealtyp@ rwork_size_query;
-        fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
-        params->IWORK = &iwork_size_query;
-        params->RWORK = &rwork_size_query;
         params->LWORK = -1;
 
         if (call_@lapack_func@(params) != 0)
             goto error;
 
-        work_count = (fortran_int)work_size_query.r;
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
 
-        work_size  = (size_t )work_size_query.r * sizeof(@ftyp@);
-        rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
-        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    mem_buff2 = malloc(work_size + rwork_size + iwork_size);
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
+                                    work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+    mem_buff2 = malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
-    rwork = work + work_size;
-    iwork = rwork + rwork_size;
 
     params->WORK = work;
-    params->RWORK = rwork;
-    params->IWORK = iwork;
-    params->LWORK = work_count;
 
     return 1;
  error:
@@ -3125,20 +3071,11 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
 
 /**end repeat**/
 
-
 /**begin repeat
-   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
-   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
-   #dot_func=sdot,ddot,cdotc,zdotc#
-   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float,  npy_double#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #cmplx = 0, 0, 1, 1#
+   #lapack_func=dgeqrf,zgeqrf#
  */
 static inline void
-release_@lapack_func@(GELSD_PARAMS_t* params)
+release_@lapack_func@(GEQRF_PARAMS_t* params)
 {
     /* A and WORK contain allocated blocks */
     free(params->A);
@@ -3146,84 +3083,46 @@ release_@lapack_func@(GELSD_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
-/** Compute the squared l2 norm of a contiguous vector */
-static @basetyp@
-@TYPE@_abs2(@typ@ *p, npy_intp n) {
-    npy_intp i;
-    @basetyp@ res = 0;
-    for (i = 0; i < n; i++) {
-        @typ@ el = p[i];
-#if @cmplx@
-        res += el.real*el.real + el.imag*el.imag;
-#else
-        res += el*el;
-#endif
-    }
-    return res;
-}
+/**end repeat**/
 
+/**begin repeat
+   #TYPE=DOUBLE,CDOUBLE#
+   #REALTYPE=DOUBLE,DOUBLE#
+   #lapack_func=dgeqrf,zgeqrf#
+   #typ     = npy_double,npy_cdouble#
+   #basetyp = npy_double,npy_double#
+   #ftyp = fortran_doublereal,fortran_doublecomplex#
+   #cmplx = 0, 1#
+ */
 static void
-@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
-             void *NPY_UNUSED(func))
+@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
 {
-    GELSD_PARAMS_t params;
+    GEQRF_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
-    fortran_int n, m, nrhs;
-    fortran_int excess;
+    fortran_int n, m;
 
-    INIT_OUTER_LOOP_7
+    INIT_OUTER_LOOP_2
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
-    nrhs = (fortran_int)dimensions[2];
-    excess = m - n;
 
-    if (init_@lapack_func@(&params, m, n, nrhs)) {
-        LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
+    if (init_@lapack_func@(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, tau_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
-        init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
-        init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
-        init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
-        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
+        init_linearize_data(&tau_out, 1, fortran_int_min(m, n), 1, steps[2]);
 
-        BEGIN_OUTER_LOOP_7
+        BEGIN_OUTER_LOOP_2
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
-            params.RCOND = args[2];
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
-                *(npy_int*) args[5] = params.RANK;
-                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
-
-                /* Note that linalg.lstsq discards this when excess == 0 */
-                if (excess >= 0 && params.RANK == n) {
-                    /* Compute the residuals as the square sum of each column */
-                    int i;
-                    char *resid = args[4];
-                    @ftyp@ *components = (@ftyp@ *)params.B + n;
-                    for (i = 0; i < nrhs; i++) {
-                        @ftyp@ *vector = components + i*m;
-                        /* Numpy and fortran floating types are the same size,
-                         * so this cast is safe */
-                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
-                        memcpy(
-                            resid + i*r_out.column_strides,
-                            &abs2, sizeof(abs2));
-                    }
-                }
-                else {
-                    /* Note that this is always discarded by linalg.lstsq */
-                    nan_@REALTYPE@_matrix(args[4], &r_out);
-                }
+                delinearize_@TYPE@_matrix(args[0], params.A, &a_in);
+                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[3], &x_out);
-                nan_@REALTYPE@_matrix(args[4], &r_out);
-                *(npy_int*) args[5] = -1;
-                nan_@REALTYPE@_matrix(args[6], &s_out);
+                nan_@TYPE@_matrix(args[1], &tau_out);
             }
         END_OUTER_LOOP
 
@@ -3235,31 +3134,814 @@ static void
 
 /**end repeat**/
 
-#pragma GCC diagnostic pop
-
 /* -------------------------------------------------------------------------- */
-              /* gufunc registration  */
+                 /* qr common code (modes - reduced and complete) */ 
 
-static void *array_of_nulls[] = {
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
+typedef struct gqr_params_struct
+{
+    fortran_int M;
+    fortran_int MC;
+    fortran_int MN;
+    void* A;
+    void *Q;
+    fortran_int LDA;
+    void* TAU;
+    void *WORK;
+    fortran_int LWORK;
+} GQR_PARAMS_t;
 
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
+/**begin repeat
+   #lapack_func=dorgqr,zungqr#
+ */
 
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
+static inline fortran_int
+call_@lapack_func@(GQR_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->MC, &params->MN,
+                          params->Q, &params->LDA,
+                          params->TAU,
+                          params->WORK, &params->LWORK,
+                          &rv);
+    return rv;
+}
 
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL,
-    (void *)NULL
+/**end repeat**/
+
+/**begin repeat
+   #lapack_func=dorgqr#
+   #ftyp=fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@_common(GQR_PARAMS_t *params,
+                          fortran_int m,
+                          fortran_int n,
+                          fortran_int mc)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *q, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_mc = mc;
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(q_size + tau_size + a_size);
+
+    if (!mem_buff)
+        goto error;
+
+    q = mem_buff;
+    tau = q + q_size;
+    a = tau + tau_size;
+
+
+    params->M = m;
+    params->MC = mc;
+    params->MN = min_m_n;
+    params->A = a;
+    params->Q = q;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int) *(@ftyp@*) params->WORK;
+
+    }
+
+    params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+
+    params->WORK = work;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #lapack_func=zungqr#
+   #ftyp=fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@_common(GQR_PARAMS_t *params,
+                          fortran_int m,
+                          fortran_int n,
+                          fortran_int mc)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *q, *tau, *work;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    size_t safe_mc = mc;
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
+    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    fortran_int lda = fortran_int_max(1, m);
+
+    mem_buff = malloc(q_size + tau_size + a_size);
+
+    if (!mem_buff)
+        goto error;
+
+    q = mem_buff;
+    tau = q + q_size;
+    a = tau + tau_size;
+
+
+    params->M = m;
+    params->MC = mc;
+    params->MN = min_m_n;
+    params->A = a;
+    params->Q = q;
+    params->TAU = tau;
+    params->LDA = lda;
+
+    {
+        /* compute optimal work size */
+        @ftyp@ work_size_query;
+
+        params->WORK = &work_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+
+    }
+
+    params->LWORK = fortran_int_max(fortran_int_max(1, n),
+                                    work_count);
+
+    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+    mem_buff2 = malloc(work_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+
+    params->WORK = work;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - reduced) */
+
+
+static inline void
+dump_gqr_params(const char *name,
+                GQR_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n",
+
+              name,
+
+              "Q", params->Q,
+              "TAU", params->TAU,
+              "WORK", params->WORK,
+
+              "M", (int)params->M,
+              "MC", (int)params->MC,
+              "MN", (int)params->MN,
+              "LDA", (int)params->LDA,
+              "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+   #lapack_func=dorgqr,zungqr#
+   #ftyp=fortran_doublereal,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@(GQR_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n)
+{
+    return init_@lapack_func@_common(
+        params, m, n, 
+        fortran_int_min(m, n));
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #lapack_func=dorgqr,zungqr#
+ */
+static inline void
+release_@lapack_func@(GQR_PARAMS_t* params)
+{
+    /* A and WORK contain allocated blocks */
+    free(params->Q);
+    free(params->WORK);
+    memset(params, 0, sizeof(*params));
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=DOUBLE,CDOUBLE#
+   #REALTYPE=DOUBLE,DOUBLE#
+   #lapack_func=dorgqr,zungqr#
+   #typ     = npy_double, npy_cdouble#
+   #basetyp = npy_double, npy_double#
+   #ftyp = fortran_doublereal,fortran_doublecomplex#
+   #cmplx = 0, 1#
+ */
+static void
+@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    GQR_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m;
+
+    INIT_OUTER_LOOP_3
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+
+    if (init_@lapack_func@(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, tau_in, q_out;
+
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+        init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+        init_linearize_data(&q_out, fortran_int_min(m, n), m, steps[4], steps[3]);
+
+        BEGIN_OUTER_LOOP_3
+            int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
+            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[2], &q_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+                 /* qr (modes - complete) */
+
+/**begin repeat
+   #lapack_func=dorgqr,zungqr#
+   #ftyp=fortran_doublereal,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@_complete(GQR_PARAMS_t *params,
+                            fortran_int m,
+                            fortran_int n)
+{
+    return init_@lapack_func@_common(params, m, n, m);
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=DOUBLE,CDOUBLE#
+   #REALTYPE=DOUBLE,DOUBLE#
+   #lapack_func=dorgqr,zungqr#
+   #typ     = npy_double,npy_cdouble#
+   #basetyp = npy_double,npy_double#
+   #ftyp = fortran_doublereal,fortran_doublecomplex#
+   #cmplx = 0, 1#
+ */
+static void
+@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    GQR_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m;
+
+    INIT_OUTER_LOOP_3
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+
+
+    if (init_@lapack_func@_complete(&params, m, n)) {
+        LINEARIZE_DATA_t a_in, tau_in, q_out;
+
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+        init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+        init_linearize_data(&q_out, m, m, steps[4], steps[3]);
+
+        BEGIN_OUTER_LOOP_3
+            int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
+            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[2], &q_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+                 /* least squares */
+
+typedef struct gelsd_params_struct
+{
+    fortran_int M;
+    fortran_int N;
+    fortran_int NRHS;
+    void *A;
+    fortran_int LDA;
+    void *B;
+    fortran_int LDB;
+    void *S;
+    void *RCOND;
+    fortran_int RANK;
+    void *WORK;
+    fortran_int LWORK;
+    void *RWORK;
+    void *IWORK;
+} GELSD_PARAMS_t;
+
+
+static inline void
+dump_gelsd_params(const char *name,
+                  GELSD_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+
+              "%14s: %18p\n",
+
+              name,
+
+              "A", params->A,
+              "B", params->B,
+              "S", params->S,
+              "WORK", params->WORK,
+              "RWORK", params->RWORK,
+              "IWORK", params->IWORK,
+
+              "M", (int)params->M,
+              "N", (int)params->N,
+              "NRHS", (int)params->NRHS,
+              "LDA", (int)params->LDA,
+              "LDB", (int)params->LDB,
+              "LWORK", (int)params->LWORK,
+              "RANK", (int)params->RANK,
+
+              "RCOND", params->RCOND);
+}
+
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sgelsd,dgelsd#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->IWORK,
+                          &rv);
+    return rv;
+}
+
+static inline int
+init_@lapack_func@(GELSD_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n,
+                   fortran_int nrhs)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *b, *s, *work, *iwork;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    fortran_int max_m_n = fortran_int_max(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_max_m_n = max_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+    size_t safe_nrhs = nrhs;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    size_t iwork_size;
+    fortran_int lda = fortran_int_max(1, m);
+    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
+
+    mem_buff = malloc(a_size + b_size + s_size);
+
+    if (!mem_buff)
+        goto error;
+
+    a = mem_buff;
+    b = a + a_size;
+    s = b + b_size;
+
+
+    params->M = m;
+    params->N = n;
+    params->NRHS = nrhs;
+    params->A = a;
+    params->B = b;
+    params->S = s;
+    params->LDA = lda;
+    params->LDB = ldb;
+
+    {
+        /* compute optimal work size */
+        @ftyp@ work_size_query;
+        fortran_int iwork_size_query;
+
+        params->WORK = &work_size_query;
+        params->IWORK = &iwork_size_query;
+        params->RWORK = NULL;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query;
+
+        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
+    }
+
+    mem_buff2 = malloc(work_size + iwork_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    iwork = work + work_size;
+
+    params->WORK = work;
+    params->RWORK = NULL;
+    params->IWORK = iwork;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=CFLOAT,CDOUBLE#
+   #ftyp=fortran_complex,fortran_doublecomplex#
+   #frealtyp=fortran_real,fortran_doublereal#
+   #typ=COMPLEX_t,DOUBLECOMPLEX_t#
+   #lapack_func=cgelsd,zgelsd#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->RWORK, params->IWORK,
+                          &rv);
+    return rv;
+}
+
+static inline int
+init_@lapack_func@(GELSD_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n,
+                   fortran_int nrhs)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    fortran_int max_m_n = fortran_int_max(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_max_m_n = max_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+    size_t safe_nrhs = nrhs;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
+
+    fortran_int work_count;
+    size_t work_size, rwork_size, iwork_size;
+    fortran_int lda = fortran_int_max(1, m);
+    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
+
+    mem_buff = malloc(a_size + b_size + s_size);
+
+    if (!mem_buff)
+        goto error;
+
+    a = mem_buff;
+    b = a + a_size;
+    s = b + b_size;
+
+
+    params->M = m;
+    params->N = n;
+    params->NRHS = nrhs;
+    params->A = a;
+    params->B = b;
+    params->S = s;
+    params->LDA = lda;
+    params->LDB = ldb;
+
+    {
+        /* compute optimal work size */
+        @ftyp@ work_size_query;
+        @frealtyp@ rwork_size_query;
+        fortran_int iwork_size_query;
+
+        params->WORK = &work_size_query;
+        params->IWORK = &iwork_size_query;
+        params->RWORK = &rwork_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query.r;
+
+        work_size  = (size_t )work_size_query.r * sizeof(@ftyp@);
+        rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
+        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
+    }
+
+    mem_buff2 = malloc(work_size + rwork_size + iwork_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    rwork = work + work_size;
+    iwork = rwork + rwork_size;
+
+    params->WORK = work;
+    params->RWORK = rwork;
+    params->IWORK = iwork;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+   #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
+   #dot_func=sdot,ddot,cdotc,zdotc#
+   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float,  npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
+ */
+static inline void
+release_@lapack_func@(GELSD_PARAMS_t* params)
+{
+    /* A and WORK contain allocated blocks */
+    free(params->A);
+    free(params->WORK);
+    memset(params, 0, sizeof(*params));
+}
+
+/** Compute the squared l2 norm of a contiguous vector */
+static @basetyp@
+@TYPE@_abs2(@typ@ *p, npy_intp n) {
+    npy_intp i;
+    @basetyp@ res = 0;
+    for (i = 0; i < n; i++) {
+        @typ@ el = p[i];
+#if @cmplx@
+        res += el.real*el.real + el.imag*el.imag;
+#else
+        res += el*el;
+#endif
+    }
+    return res;
+}
+
+static void
+@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
+             void *NPY_UNUSED(func))
+{
+    GELSD_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m, nrhs;
+    fortran_int excess;
+
+    INIT_OUTER_LOOP_7
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+    nrhs = (fortran_int)dimensions[2];
+    excess = m - n;
+
+    if (init_@lapack_func@(&params, m, n, nrhs)) {
+        LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
+
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+        init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
+        init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
+        init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
+        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
+
+        BEGIN_OUTER_LOOP_7
+            int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
+            params.RCOND = args[2];
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
+                *(npy_int*) args[5] = params.RANK;
+                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
+
+                /* Note that linalg.lstsq discards this when excess == 0 */
+                if (excess >= 0 && params.RANK == n) {
+                    /* Compute the residuals as the square sum of each column */
+                    int i;
+                    char *resid = args[4];
+                    @ftyp@ *components = (@ftyp@ *)params.B + n;
+                    for (i = 0; i < nrhs; i++) {
+                        @ftyp@ *vector = components + i*m;
+                        /* Numpy and fortran floating types are the same size,
+                         * so this cast is safe */
+                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
+                        memcpy(
+                            resid + i*r_out.column_strides,
+                            &abs2, sizeof(abs2));
+                    }
+                }
+                else {
+                    /* Note that this is always discarded by linalg.lstsq */
+                    nan_@REALTYPE@_matrix(args[4], &r_out);
+                }
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[3], &x_out);
+                nan_@REALTYPE@_matrix(args[4], &r_out);
+                *(npy_int*) args[5] = -1;
+                nan_@REALTYPE@_matrix(args[6], &s_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+#pragma GCC diagnostic pop
+
+/* -------------------------------------------------------------------------- */
+              /* gufunc registration  */
+
+static void *array_of_nulls[] = {
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL,
+    (void *)NULL
 };
 
 #define FUNC_ARRAY_NAME(NAME) NAME ## _funcs
@@ -3291,6 +3973,17 @@ static void *array_of_nulls[] = {
         CDOUBLE_ ## NAME                                \
     }
 
+/* The single precision functions are not used at all,
+ * due to input data being promoted to double precision
+ * in Python, so they are not implemented here.
+ */
+#define GUFUNC_FUNC_ARRAY_QR(NAME)                      \
+    static PyUFuncGenericFunction                       \
+    FUNC_ARRAY_NAME(NAME)[] = {                         \
+        DOUBLE_ ## NAME,                                \
+        CDOUBLE_ ## NAME                                \
+    }
+
 
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det);
@@ -3305,6 +3998,9 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
+GUFUNC_FUNC_ARRAY_QR(qr_r_raw);
+GUFUNC_FUNC_ARRAY_QR(qr_reduced);
+GUFUNC_FUNC_ARRAY_QR(qr_complete);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
@@ -3371,6 +4067,24 @@ static char svd_1_3_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
 };
 
+/* A, tau */
+static char qr_r_raw_types[] = {
+    NPY_DOUBLE,  NPY_DOUBLE,
+    NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
+/* A, tau, q */
+static char qr_reduced_types[] = {
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
+/* A, tau, q */
+static char qr_complete_types[] = {
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
 /*  A,           b,           rcond,      x,           resid,      rank,    s,        */
 static char lstsq_types[] = {
     NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,  NPY_FLOAT,   NPY_FLOAT,  NPY_INT, NPY_FLOAT,
@@ -3570,6 +4284,42 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         FUNC_ARRAY_NAME(eigvals),
         eigvals_types
     },
+    {
+        "qr_r_raw_m",
+        "(m,n)->(m)",
+        "Compute TAU vector for the last two dimensions \n"\
+        "and broadcast to the rest. For m <= n. \n",
+        2, 1, 1,
+        FUNC_ARRAY_NAME(qr_r_raw),
+        qr_r_raw_types
+    },
+    {
+        "qr_r_raw_n",
+        "(m,n)->(n)",
+        "Compute TAU vector for the last two dimensions \n"\
+        "and broadcast to the rest. For m > n. \n",
+        2, 1, 1,
+        FUNC_ARRAY_NAME(qr_r_raw),
+        qr_r_raw_types
+    },
+    {
+        "qr_reduced",
+        "(m,n),(k)->(m,k)",
+        "Compute Q matrix for the last two dimensions \n"\
+        "and broadcast to the rest. \n",
+        2, 2, 1,
+        FUNC_ARRAY_NAME(qr_reduced),
+        qr_reduced_types
+    },
+    {
+        "qr_complete",
+        "(m,n),(n)->(m,m)",
+        "Compute Q matrix for the last two dimensions \n"\
+        "and broadcast to the rest. For m > n. \n",
+        2, 2, 1,
+        FUNC_ARRAY_NAME(qr_complete),
+        qr_complete_types
+    },
     {
         "lstsq_m",
         "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",