From 5ca91c47389cc0bb8ca047421e5aca802d5745c3 Mon Sep 17 00:00:00 2001
From: Simon Davy <simon.davy@canonical.com>
Date: Fri, 13 Dec 2019 18:00:53 +0000
Subject: [PATCH] Better gunicorn timeout handling.

By raising our own exception in Gunicorn's worker_abort handler, we can
catch and handle it just like any other error. This means we get an
error response, logging, and sentry report for the timeout, which is
a massive improvement.

Also includes a few of drive-by tweaks - disable parallel pytest runs in
tox, as it generally confuses things, tweak the error message for
assert_log help to reduce noise, and slight tweaks to the title of the
default error pages.
---
 Makefile                 |   3 +-
 talisker/gunicorn.py     |  30 ++++++++++--
 talisker/testing.py      |  19 +++++---
 talisker/wsgi.py         |  16 ++++--
 tests/test_functional.py |  30 ++++++++++++
 tests/test_gunicorn.py   |   7 ++-
 tests/test_testing.py    |  11 +++--
 tests/test_wsgi.py       | 102 +++++++++++++++++++++++++--------------
 tests/wsgi_app.py        |  20 ++++++++
 tox.ini                  |   2 +-
 10 files changed, 182 insertions(+), 58 deletions(-)

diff --git a/Makefile b/Makefile
index e9d27ff9..726dc5be 100644
--- a/Makefile
+++ b/Makefile
@@ -59,8 +59,9 @@ export DEVEL=1
 WORKER ?= sync
 PORT ?= 8000
 TALISKER = $(BIN)/talisker.gunicorn --bind 0.0.0.0:$(PORT) --reload --worker-class $(WORKER) $(ARGS)
+APP ?= application
 run wsgi:
-	$(TALISKER) tests.wsgi_app:application
+	$(TALISKER) tests.wsgi_app:$(APP)
 
 run_multiprocess: ARGS=-w4
 run_multiprocess: run
diff --git a/talisker/gunicorn.py b/talisker/gunicorn.py
index b27d24fe..262495ea 100644
--- a/talisker/gunicorn.py
+++ b/talisker/gunicorn.py
@@ -31,6 +31,7 @@
 
 from collections import deque
 import logging
+import sys
 
 from gunicorn.glogging import Logger
 from gunicorn.app.wsgiapp import WSGIApplication
@@ -92,7 +93,7 @@ def handle_custom():
 
 
 def gunicorn_on_starting(arbiter):
-    """Gunicorn on_starging server hook.
+    """Gunicorn on_starting server hook.
 
     Sets up the fake signal and handler on the arbiter instance.
     """
@@ -112,11 +113,33 @@ def gunicorn_child_exit(server, worker):
         server.SIG_QUEUE.append('SIGCUSTOM')
 
 
+def gunicorn_worker_abort(worker):
+    """Worker SIGABRT handler function.
+
+    SIGABRT is only used by gunicorn on worker timeout. We raise a custom
+    exception, rather than falling through to the default of SystemExit, which
+    is easier to catch and handle as a normal error by our WSGI middleware.
+    """
+    raise talisker.wsgi.RequestTimeout(
+        'gunicorn worker timeout (pid: {})'.format(worker.pid)
+    )
+
+
 def gunicorn_worker_exit(server, worker):
-    """Logs any requests that are still in flight."""
+    """Worker exit function.
+
+    Last chance to try log any outstanding requests before we die.
+    """
     for rid in list(talisker.wsgi.REQUESTS):
         request = talisker.wsgi.REQUESTS[rid]
-        request.finish_request(timeout=True)
+        try:
+            raise talisker.wsgi.RequestTimeout(
+                'finish processing ongoing requests on worker exit '
+                '(pid: {})'.format(worker.pid)
+            )
+        except talisker.wsgi.RequestTimeout:
+            request.exc_info = sys.exc_info()
+            request.finish_request(timeout=True)
 
 
 class GunicornLogger(Logger):
@@ -153,6 +176,7 @@ def init(self, parser, opts, args):
 
         cfg['logger_class'] = GunicornLogger
         cfg['worker_exit'] = gunicorn_worker_exit
+        cfg['worker_abort'] = gunicorn_worker_abort
 
         # only enable these if we are doing multiproc cleanup
         if talisker.prometheus_multiproc_cleanup:
diff --git a/talisker/testing.py b/talisker/testing.py
index 51b9d841..b85e335b 100644
--- a/talisker/testing.py
+++ b/talisker/testing.py
@@ -172,17 +172,20 @@ def assert_log(self, **kwargs):
         if not self.exists(**kwargs):
             # evaluate each term independently to narrow down culprit
             terms = []
+            extra = kwargs.pop('extra', {})
             for kw, value in kwargs.items():
-                num = len(self.filter(**{kw: value}))
-                terms.append((num, kw, value))
-            terms.sort()  # 0 matches go first, as likely to be the issue
+                if len(self.filter(**{kw: value})) == 0:
+                    terms.append((kw, value))
+            for kw, value in extra.items():
+                if len(self.filter(**{'extra': {kw: value}})) == 0:
+                    terms.append(('extra["' + kw + '"]', value))
 
-            desc = '\n    '.join(
-                '{1}={2!r} ({0} matches)'.format(*t) for t in terms
-            )
+            desc = '\n'.join('    {}={}'.format(k, v) for k, v in terms)
             raise AssertionError(
-                'Could not find log out of {} logs:\n    {}'.format(
-                    len(self), desc))
+                'Could not find log out of {} logs.\n'
+                'Search terms that could not be found:\n'
+                '{}'.format(len(self), desc)
+            )
 
     def assert_not_log(self, **kwargs):
         if self.exists(**kwargs):
diff --git a/talisker/wsgi.py b/talisker/wsgi.py
index ac400abd..23d39135 100644
--- a/talisker/wsgi.py
+++ b/talisker/wsgi.py
@@ -65,6 +65,10 @@
 REQUESTS = {}
 
 
+class RequestTimeout(Exception):
+    pass
+
+
 def talisker_error_response(environ, headers, exc_info):
     """Returns WSGI iterable to be returned as an error response.
 
@@ -89,11 +93,11 @@ def talisker_error_response(environ, headers, exc_info):
             wsgi_environ.append((k, v))
 
     if config.devel:
-        title = 'Request {}: {}'.format(rid, exc)
+        title = '{}: {}'.format(exc_type.__name__, exc)
         lines = traceback.format_exception(*exc_info)
         tb = PreformattedText(''.join(lines), id='traceback')
     else:
-        title = 'Request {}: {}'.format(rid, exc_type.__name__)
+        title = 'Server Error: {}'.format(exc_type.__name__)
 
     content = [
         Content(title, tag='h1', id='title'),
@@ -302,8 +306,10 @@ def __next__(self):
                 if not self.start_response_called:
                     self.call_start_response()
                 raise
-            except Exception:
+            except Exception as e:
                 self.exc_info = sys.exc_info()
+                if isinstance(e, RequestTimeout):
+                    self.timedout = True
                 # switch to generating an error response
                 self.iter = iter(self.error(self.exc_info))
                 chunk = next(self.iter)
@@ -583,9 +589,11 @@ def __call__(self, environ, start_response):
 
         try:
             response_iter = self.app(environ, request.start_response)
-        except Exception:
+        except Exception as e:
             # store details for later
             request.exc_info = sys.exc_info()
+            if isinstance(e, RequestTimeout):
+                request.timedout = True
             # switch to generating an error response
             response_iter = request.error(request.exc_info)
         except SystemExit as e:
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 41c4bc74..1de50cab 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -116,6 +116,36 @@ def test_django_app(django):
     assert response.headers['X-View-Name'] == 'django_app.views.index'
 
 
+@require_module('gunicorn')
+def test_gunicorn_timeout(tmpdir):
+
+    def test_app():
+        import time
+
+        def app(environ, start_response):
+            start_response('200 OK', [('content-type', 'text/plain')])
+            time.sleep(100)
+            return []
+
+    app_module = str(tmpdir / 'app.py')
+    with open(app_module, 'w') as f:
+        f.write(get_function_body(test_app))
+
+    # ensure devel mode
+    env = os.environ.copy()
+    env['DEVEL'] = '1'
+    p = GunicornProcess('app:app', args=['-t1'], cwd=str(tmpdir), env=env)
+    with p:
+        response = requests.get(
+            p.url('/'),
+            headers={'Accept': 'application/json'},
+        ).json()
+
+    assert response['title'].startswith(
+        'RequestTimeout: gunicorn worker timeout (pid:'
+    )
+
+
 @require_module('celery')
 def test_celery_basic(celery_signals):
     from tests.celery_app import basic_task, error_task, propagate_task
diff --git a/tests/test_gunicorn.py b/tests/test_gunicorn.py
index e111cda4..5064bd88 100644
--- a/tests/test_gunicorn.py
+++ b/tests/test_gunicorn.py
@@ -264,7 +264,10 @@ def test_gunicorn_worker_exit(wsgi_env, context):
     request = talisker.wsgi.TaliskerWSGIRequest(wsgi_env, None, [])
     talisker.wsgi.REQUESTS['ID'] = request
 
-    gunicorn.gunicorn_worker_exit(None, None)
+    class worker:
+        pid = 100
+
+    gunicorn.gunicorn_worker_exit(None, worker())
 
     context.assert_log(
         name='talisker.wsgi',
@@ -274,3 +277,5 @@ def test_gunicorn_worker_exit(wsgi_env, context):
             'timeout': True,
         },
     )
+
+    assert len(context.sentry) == 1
diff --git a/tests/test_testing.py b/tests/test_testing.py
index 12cbb532..3f34e2d8 100644
--- a/tests/test_testing.py
+++ b/tests/test_testing.py
@@ -115,13 +115,14 @@ def test_test_context():
         name=logger.name, msg='bar', level='warning', extra={'b': 2})
 
     with pytest.raises(AssertionError) as exc:
-        ctx.assert_log(name=logger.name, msg='XXX', level='info')
+        ctx.assert_log(
+            name=logger.name, msg='XXX', level='info', extra={"baz": 1})
 
     assert str(exc.value) == textwrap.dedent("""
-        Could not find log out of {0} logs:
-            msg={1}'XXX' (0 matches)
-            level={1}'info' (2 matches)
-            name='test_test_context' (2 matches)
+        Could not find log out of {0} logs.
+        Search terms that could not be found:
+            msg=XXX
+            extra["baz"]=1
     """).strip().format(len(ctx.logs), 'u' if sys.version_info[0] == 2 else '')
 
     with pytest.raises(AssertionError) as exc:
diff --git a/tests/test_wsgi.py b/tests/test_wsgi.py
index bbd876ff..2618ebc5 100644
--- a/tests/test_wsgi.py
+++ b/tests/test_wsgi.py
@@ -87,7 +87,8 @@ def run(env=None, status='200 OK', headers=None, body=None, duration=1):
     return run
 
 
-def test_error_response_handler(wsgi_env):
+def test_error_response_handler(wsgi_env, config):
+    config['DEVEL'] = 0
     wsgi_env['REQUEST_ID'] = 'REQUESTID'
     wsgi_env['HTTP_ACCEPT'] = 'application/json'
     headers = [('X-VCS-Revision', 'revid')]
@@ -105,7 +106,7 @@ def test_error_response_handler(wsgi_env):
     )
     error = json.loads(body.decode('utf8'))
     assert content_type == 'application/json'
-    assert error['title'] == 'Request REQUESTID: Exception'
+    assert error['title'] == 'Server Error: Exception'
     assert error['id'] == {'Request-Id': 'REQUESTID'}
     assert error['traceback'] == '[traceback hidden]'
     assert error['request_headers'] == {
@@ -136,7 +137,7 @@ def test_error_response_handler_devel(wsgi_env, config):
         exc_info,
     )
     error = json.loads(body.decode('utf8'))
-    assert error['title'] == 'Request REQUESTID: test'
+    assert error['title'] == 'Exception: test'
     assert error['traceback'][0] == 'Traceback (most recent call last):'
     assert error['traceback'][-3] == '    raise Exception(\'test\')'
     assert error['traceback'][-2] == 'Exception: test'
@@ -216,7 +217,11 @@ def test_wsgi_request_wrap_file(run_wsgi, context, tmpdir):
     )
 
 
-def test_wsgi_request_wrap_error(run_wsgi, context):
+@pytest.mark.parametrize('exc_type', [
+    Exception,
+    wsgi.RequestTimeout,
+])
+def test_wsgi_request_wrap_error_in_iterator(exc_type, run_wsgi, context):
     env = {
         'REQUEST_ID': 'REQUESTID',
         'HTTP_ACCEPT': 'application/json',
@@ -227,27 +232,31 @@ def __iter__(self):
             return self
 
         def __next__(self):
-            raise Exception('error')
+            raise exc_type('error')
 
     headers, body = run_wsgi(env, body=ErrorGenerator())
     output = b''.join(body)
     error = json.loads(output.decode('utf8'))
 
-    assert error['title'] == 'Request REQUESTID: Exception'
+    assert error['title'] == 'Server Error: ' + exc_type.__name__
 
-    context.assert_log(
-        msg='GET /',
-        extra=dict([
-            ('method', 'GET'),
-            ('path', '/'),
-            ('status', 500),
-            ('duration_ms', 1000.0),
-            ('ip', '127.0.0.1'),
-            ('proto', 'HTTP/1.0'),
-            ('length', len(output)),
-            ('exc_type', 'Exception'),
-        ]),
-    )
+    extra = dict([
+        ('method', 'GET'),
+        ('path', '/'),
+        ('status', 500),
+        ('duration_ms', 1000.0),
+        ('ip', '127.0.0.1'),
+        ('proto', 'HTTP/1.0'),
+        ('length', len(output)),
+        ('exc_type', exc_type.__name__),
+    ])
+    if exc_type is wsgi.RequestTimeout:
+        extra['timeout'] = True
+
+    context.assert_log(msg='GET /', extra=extra)
+
+    # check it sent a sentry message
+    assert len(context.sentry) == 1
 
 
 def test_wsgi_request_wrap_error_headers_sent(run_wsgi, context):
@@ -445,11 +454,15 @@ def app(environ, _start_response):
     assert contexts[0].deadline == datetime_to_timestamp(ts)
 
 
+@pytest.mark.parametrize('exc_type', [
+    Exception,
+    wsgi.RequestTimeout,
+])
 def test_middleware_error_before_start_response(
-        wsgi_env, start_response, context):
+        exc_type, wsgi_env, start_response, context):
 
     def app(environ, _start_response):
-        raise Exception('error')
+        raise exc_type('error')
 
     extra_env = {'ENV': 'VALUE'}
     extra_headers = {'Some-Header': 'value'}
@@ -460,33 +473,44 @@ def app(environ, _start_response):
     output = b''.join(mw(wsgi_env, start_response))
     error = json.loads(output.decode('utf8'))
 
-    assert error['title'] == 'Request ID: Exception'
+    assert error['title'] == 'Server Error: ' + exc_type.__name__
     assert wsgi_env['ENV'] == 'VALUE'
     assert wsgi_env['REQUEST_ID'] == 'ID'
-    assert start_response.status == '500 Internal Server Error'
-    assert start_response.exc_info[0] is Exception
+    assert start_response.exc_info[0] is exc_type
     assert start_response.headers[:3] == [
         ('Content-Type', 'application/json'),
         ('Some-Header', 'value'),
         ('X-Request-Id', 'ID'),
     ]
 
+    extra = {
+        'status': 500,
+        'exc_type': exc_type.__name__,
+    }
+
+    if exc_type is wsgi.RequestTimeout:
+        extra['timeout'] = True
+
     context.assert_log(
         name='talisker.wsgi',
         msg='GET /',
-        extra={
-            'status': 500,
-            'exc_type': 'Exception',
-        },
+        extra=extra,
     )
 
+    # check it sent a sentry message
+    assert len(context.sentry) == 1
 
+
+@pytest.mark.parametrize('exc_type', [
+    Exception,
+    wsgi.RequestTimeout,
+])
 def test_middleware_error_after_start_response(
-        wsgi_env, start_response, context):
+        exc_type, wsgi_env, start_response, context):
 
     def app(wsgi_env, _start_response):
         _start_response('200 OK', [('Content-Type', 'application/json')])
-        raise Exception('error')
+        raise exc_type('error')
 
     extra_env = {'ENV': 'VALUE'}
     extra_headers = {'Some-Header': 'value'}
@@ -497,7 +521,7 @@ def app(wsgi_env, _start_response):
     output = b''.join(mw(wsgi_env, start_response))
     error = json.loads(output.decode('utf8'))
 
-    assert error['title'] == 'Request ID: Exception'
+    assert error['title'] == 'Server Error: ' + exc_type.__name__
     assert wsgi_env['ENV'] == 'VALUE'
     assert wsgi_env['REQUEST_ID'] == 'ID'
     assert start_response.status == '500 Internal Server Error'
@@ -507,15 +531,23 @@ def app(wsgi_env, _start_response):
         ('X-Request-Id', 'ID'),
     ]
 
+    extra = {
+        'status': 500,
+        'exc_type': exc_type.__name__,
+    }
+
+    if exc_type is wsgi.RequestTimeout:
+        extra['timeout'] = True
+
     context.assert_log(
         name='talisker.wsgi',
         msg='GET /',
-        extra={
-            'status': 500,
-            'exc_type': 'Exception',
-        },
+        extra=extra,
     )
 
+    # check it sent a sentry message
+    assert len(context.sentry) == 1
+
 
 def test_middleware_preserves_file_wrapper(
         wsgi_env, start_response, context, tmpdir):
diff --git a/tests/wsgi_app.py b/tests/wsgi_app.py
index 4b552d74..c12ad242 100644
--- a/tests/wsgi_app.py
+++ b/tests/wsgi_app.py
@@ -29,6 +29,7 @@
 
 import pprint
 import logging
+import time
 
 
 def application(environ, start_response):
@@ -44,6 +45,7 @@ def application(environ, start_response):
     logger.warning('warning')
     logger.error('error')
     logger.critical('critical')
+    time.sleep(10)
     return [output.encode('utf8')]
 
 
@@ -54,3 +56,21 @@ def app404(environ, start_response):
     else:
         start_response('404 Not Found', [])
         return [b'Not Found']
+
+
+def timeout(environ, start_response):
+    time.sleep(1000)
+
+
+def timeout2(environ, start_response):
+    start_response('200 OK', [('content-type', 'text/plain')])
+    time.sleep(1000)
+
+
+def timeout3(environ, start_response):
+    start_response('200 OK', [('content-type', 'text/plain')])
+
+    def i():
+        yield time.sleep(1000)
+
+    return i()
diff --git a/tox.ini b/tox.ini
index 071e323c..d5b04f43 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,7 @@ skipsdist = True
 passenv = TRAVIS TRAVIS_JOB_ID TRAVIS_BRANCH
 usedevelop = True
 deps = -r{toxinidir}/requirements.tests.txt
-commands = py.test -n auto --cov=talisker --no-success-flaky-report
+commands = py.test -v --cov=talisker --no-success-flaky-report
 extras =
     gunicorn
     raven