From 5ca91c47389cc0bb8ca047421e5aca802d5745c3 Mon Sep 17 00:00:00 2001 From: Simon Davy Date: Fri, 13 Dec 2019 18:00:53 +0000 Subject: [PATCH] Better gunicorn timeout handling. By raising our own exception in Gunicorn's worker_abort handler, we can catch and handle it just like any other error. This means we get an error response, logging, and sentry report for the timeout, which is a massive improvement. Also includes a few of drive-by tweaks - disable parallel pytest runs in tox, as it generally confuses things, tweak the error message for assert_log help to reduce noise, and slight tweaks to the title of the default error pages. --- Makefile | 3 +- talisker/gunicorn.py | 30 ++++++++++-- talisker/testing.py | 19 +++++--- talisker/wsgi.py | 16 ++++-- tests/test_functional.py | 30 ++++++++++++ tests/test_gunicorn.py | 7 ++- tests/test_testing.py | 11 +++-- tests/test_wsgi.py | 102 +++++++++++++++++++++++++-------------- tests/wsgi_app.py | 20 ++++++++ tox.ini | 2 +- 10 files changed, 182 insertions(+), 58 deletions(-) diff --git a/Makefile b/Makefile index e9d27ff9..726dc5be 100644 --- a/Makefile +++ b/Makefile @@ -59,8 +59,9 @@ export DEVEL=1 WORKER ?= sync PORT ?= 8000 TALISKER = $(BIN)/talisker.gunicorn --bind 0.0.0.0:$(PORT) --reload --worker-class $(WORKER) $(ARGS) +APP ?= application run wsgi: - $(TALISKER) tests.wsgi_app:application + $(TALISKER) tests.wsgi_app:$(APP) run_multiprocess: ARGS=-w4 run_multiprocess: run diff --git a/talisker/gunicorn.py b/talisker/gunicorn.py index b27d24fe..262495ea 100644 --- a/talisker/gunicorn.py +++ b/talisker/gunicorn.py @@ -31,6 +31,7 @@ from collections import deque import logging +import sys from gunicorn.glogging import Logger from gunicorn.app.wsgiapp import WSGIApplication @@ -92,7 +93,7 @@ def handle_custom(): def gunicorn_on_starting(arbiter): - """Gunicorn on_starging server hook. + """Gunicorn on_starting server hook. Sets up the fake signal and handler on the arbiter instance. """ @@ -112,11 +113,33 @@ def gunicorn_child_exit(server, worker): server.SIG_QUEUE.append('SIGCUSTOM') +def gunicorn_worker_abort(worker): + """Worker SIGABRT handler function. + + SIGABRT is only used by gunicorn on worker timeout. We raise a custom + exception, rather than falling through to the default of SystemExit, which + is easier to catch and handle as a normal error by our WSGI middleware. + """ + raise talisker.wsgi.RequestTimeout( + 'gunicorn worker timeout (pid: {})'.format(worker.pid) + ) + + def gunicorn_worker_exit(server, worker): - """Logs any requests that are still in flight.""" + """Worker exit function. + + Last chance to try log any outstanding requests before we die. + """ for rid in list(talisker.wsgi.REQUESTS): request = talisker.wsgi.REQUESTS[rid] - request.finish_request(timeout=True) + try: + raise talisker.wsgi.RequestTimeout( + 'finish processing ongoing requests on worker exit ' + '(pid: {})'.format(worker.pid) + ) + except talisker.wsgi.RequestTimeout: + request.exc_info = sys.exc_info() + request.finish_request(timeout=True) class GunicornLogger(Logger): @@ -153,6 +176,7 @@ def init(self, parser, opts, args): cfg['logger_class'] = GunicornLogger cfg['worker_exit'] = gunicorn_worker_exit + cfg['worker_abort'] = gunicorn_worker_abort # only enable these if we are doing multiproc cleanup if talisker.prometheus_multiproc_cleanup: diff --git a/talisker/testing.py b/talisker/testing.py index 51b9d841..b85e335b 100644 --- a/talisker/testing.py +++ b/talisker/testing.py @@ -172,17 +172,20 @@ def assert_log(self, **kwargs): if not self.exists(**kwargs): # evaluate each term independently to narrow down culprit terms = [] + extra = kwargs.pop('extra', {}) for kw, value in kwargs.items(): - num = len(self.filter(**{kw: value})) - terms.append((num, kw, value)) - terms.sort() # 0 matches go first, as likely to be the issue + if len(self.filter(**{kw: value})) == 0: + terms.append((kw, value)) + for kw, value in extra.items(): + if len(self.filter(**{'extra': {kw: value}})) == 0: + terms.append(('extra["' + kw + '"]', value)) - desc = '\n '.join( - '{1}={2!r} ({0} matches)'.format(*t) for t in terms - ) + desc = '\n'.join(' {}={}'.format(k, v) for k, v in terms) raise AssertionError( - 'Could not find log out of {} logs:\n {}'.format( - len(self), desc)) + 'Could not find log out of {} logs.\n' + 'Search terms that could not be found:\n' + '{}'.format(len(self), desc) + ) def assert_not_log(self, **kwargs): if self.exists(**kwargs): diff --git a/talisker/wsgi.py b/talisker/wsgi.py index ac400abd..23d39135 100644 --- a/talisker/wsgi.py +++ b/talisker/wsgi.py @@ -65,6 +65,10 @@ REQUESTS = {} +class RequestTimeout(Exception): + pass + + def talisker_error_response(environ, headers, exc_info): """Returns WSGI iterable to be returned as an error response. @@ -89,11 +93,11 @@ def talisker_error_response(environ, headers, exc_info): wsgi_environ.append((k, v)) if config.devel: - title = 'Request {}: {}'.format(rid, exc) + title = '{}: {}'.format(exc_type.__name__, exc) lines = traceback.format_exception(*exc_info) tb = PreformattedText(''.join(lines), id='traceback') else: - title = 'Request {}: {}'.format(rid, exc_type.__name__) + title = 'Server Error: {}'.format(exc_type.__name__) content = [ Content(title, tag='h1', id='title'), @@ -302,8 +306,10 @@ def __next__(self): if not self.start_response_called: self.call_start_response() raise - except Exception: + except Exception as e: self.exc_info = sys.exc_info() + if isinstance(e, RequestTimeout): + self.timedout = True # switch to generating an error response self.iter = iter(self.error(self.exc_info)) chunk = next(self.iter) @@ -583,9 +589,11 @@ def __call__(self, environ, start_response): try: response_iter = self.app(environ, request.start_response) - except Exception: + except Exception as e: # store details for later request.exc_info = sys.exc_info() + if isinstance(e, RequestTimeout): + request.timedout = True # switch to generating an error response response_iter = request.error(request.exc_info) except SystemExit as e: diff --git a/tests/test_functional.py b/tests/test_functional.py index 41c4bc74..1de50cab 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -116,6 +116,36 @@ def test_django_app(django): assert response.headers['X-View-Name'] == 'django_app.views.index' +@require_module('gunicorn') +def test_gunicorn_timeout(tmpdir): + + def test_app(): + import time + + def app(environ, start_response): + start_response('200 OK', [('content-type', 'text/plain')]) + time.sleep(100) + return [] + + app_module = str(tmpdir / 'app.py') + with open(app_module, 'w') as f: + f.write(get_function_body(test_app)) + + # ensure devel mode + env = os.environ.copy() + env['DEVEL'] = '1' + p = GunicornProcess('app:app', args=['-t1'], cwd=str(tmpdir), env=env) + with p: + response = requests.get( + p.url('/'), + headers={'Accept': 'application/json'}, + ).json() + + assert response['title'].startswith( + 'RequestTimeout: gunicorn worker timeout (pid:' + ) + + @require_module('celery') def test_celery_basic(celery_signals): from tests.celery_app import basic_task, error_task, propagate_task diff --git a/tests/test_gunicorn.py b/tests/test_gunicorn.py index e111cda4..5064bd88 100644 --- a/tests/test_gunicorn.py +++ b/tests/test_gunicorn.py @@ -264,7 +264,10 @@ def test_gunicorn_worker_exit(wsgi_env, context): request = talisker.wsgi.TaliskerWSGIRequest(wsgi_env, None, []) talisker.wsgi.REQUESTS['ID'] = request - gunicorn.gunicorn_worker_exit(None, None) + class worker: + pid = 100 + + gunicorn.gunicorn_worker_exit(None, worker()) context.assert_log( name='talisker.wsgi', @@ -274,3 +277,5 @@ def test_gunicorn_worker_exit(wsgi_env, context): 'timeout': True, }, ) + + assert len(context.sentry) == 1 diff --git a/tests/test_testing.py b/tests/test_testing.py index 12cbb532..3f34e2d8 100644 --- a/tests/test_testing.py +++ b/tests/test_testing.py @@ -115,13 +115,14 @@ def test_test_context(): name=logger.name, msg='bar', level='warning', extra={'b': 2}) with pytest.raises(AssertionError) as exc: - ctx.assert_log(name=logger.name, msg='XXX', level='info') + ctx.assert_log( + name=logger.name, msg='XXX', level='info', extra={"baz": 1}) assert str(exc.value) == textwrap.dedent(""" - Could not find log out of {0} logs: - msg={1}'XXX' (0 matches) - level={1}'info' (2 matches) - name='test_test_context' (2 matches) + Could not find log out of {0} logs. + Search terms that could not be found: + msg=XXX + extra["baz"]=1 """).strip().format(len(ctx.logs), 'u' if sys.version_info[0] == 2 else '') with pytest.raises(AssertionError) as exc: diff --git a/tests/test_wsgi.py b/tests/test_wsgi.py index bbd876ff..2618ebc5 100644 --- a/tests/test_wsgi.py +++ b/tests/test_wsgi.py @@ -87,7 +87,8 @@ def run(env=None, status='200 OK', headers=None, body=None, duration=1): return run -def test_error_response_handler(wsgi_env): +def test_error_response_handler(wsgi_env, config): + config['DEVEL'] = 0 wsgi_env['REQUEST_ID'] = 'REQUESTID' wsgi_env['HTTP_ACCEPT'] = 'application/json' headers = [('X-VCS-Revision', 'revid')] @@ -105,7 +106,7 @@ def test_error_response_handler(wsgi_env): ) error = json.loads(body.decode('utf8')) assert content_type == 'application/json' - assert error['title'] == 'Request REQUESTID: Exception' + assert error['title'] == 'Server Error: Exception' assert error['id'] == {'Request-Id': 'REQUESTID'} assert error['traceback'] == '[traceback hidden]' assert error['request_headers'] == { @@ -136,7 +137,7 @@ def test_error_response_handler_devel(wsgi_env, config): exc_info, ) error = json.loads(body.decode('utf8')) - assert error['title'] == 'Request REQUESTID: test' + assert error['title'] == 'Exception: test' assert error['traceback'][0] == 'Traceback (most recent call last):' assert error['traceback'][-3] == ' raise Exception(\'test\')' assert error['traceback'][-2] == 'Exception: test' @@ -216,7 +217,11 @@ def test_wsgi_request_wrap_file(run_wsgi, context, tmpdir): ) -def test_wsgi_request_wrap_error(run_wsgi, context): +@pytest.mark.parametrize('exc_type', [ + Exception, + wsgi.RequestTimeout, +]) +def test_wsgi_request_wrap_error_in_iterator(exc_type, run_wsgi, context): env = { 'REQUEST_ID': 'REQUESTID', 'HTTP_ACCEPT': 'application/json', @@ -227,27 +232,31 @@ def __iter__(self): return self def __next__(self): - raise Exception('error') + raise exc_type('error') headers, body = run_wsgi(env, body=ErrorGenerator()) output = b''.join(body) error = json.loads(output.decode('utf8')) - assert error['title'] == 'Request REQUESTID: Exception' + assert error['title'] == 'Server Error: ' + exc_type.__name__ - context.assert_log( - msg='GET /', - extra=dict([ - ('method', 'GET'), - ('path', '/'), - ('status', 500), - ('duration_ms', 1000.0), - ('ip', '127.0.0.1'), - ('proto', 'HTTP/1.0'), - ('length', len(output)), - ('exc_type', 'Exception'), - ]), - ) + extra = dict([ + ('method', 'GET'), + ('path', '/'), + ('status', 500), + ('duration_ms', 1000.0), + ('ip', '127.0.0.1'), + ('proto', 'HTTP/1.0'), + ('length', len(output)), + ('exc_type', exc_type.__name__), + ]) + if exc_type is wsgi.RequestTimeout: + extra['timeout'] = True + + context.assert_log(msg='GET /', extra=extra) + + # check it sent a sentry message + assert len(context.sentry) == 1 def test_wsgi_request_wrap_error_headers_sent(run_wsgi, context): @@ -445,11 +454,15 @@ def app(environ, _start_response): assert contexts[0].deadline == datetime_to_timestamp(ts) +@pytest.mark.parametrize('exc_type', [ + Exception, + wsgi.RequestTimeout, +]) def test_middleware_error_before_start_response( - wsgi_env, start_response, context): + exc_type, wsgi_env, start_response, context): def app(environ, _start_response): - raise Exception('error') + raise exc_type('error') extra_env = {'ENV': 'VALUE'} extra_headers = {'Some-Header': 'value'} @@ -460,33 +473,44 @@ def app(environ, _start_response): output = b''.join(mw(wsgi_env, start_response)) error = json.loads(output.decode('utf8')) - assert error['title'] == 'Request ID: Exception' + assert error['title'] == 'Server Error: ' + exc_type.__name__ assert wsgi_env['ENV'] == 'VALUE' assert wsgi_env['REQUEST_ID'] == 'ID' - assert start_response.status == '500 Internal Server Error' - assert start_response.exc_info[0] is Exception + assert start_response.exc_info[0] is exc_type assert start_response.headers[:3] == [ ('Content-Type', 'application/json'), ('Some-Header', 'value'), ('X-Request-Id', 'ID'), ] + extra = { + 'status': 500, + 'exc_type': exc_type.__name__, + } + + if exc_type is wsgi.RequestTimeout: + extra['timeout'] = True + context.assert_log( name='talisker.wsgi', msg='GET /', - extra={ - 'status': 500, - 'exc_type': 'Exception', - }, + extra=extra, ) + # check it sent a sentry message + assert len(context.sentry) == 1 + +@pytest.mark.parametrize('exc_type', [ + Exception, + wsgi.RequestTimeout, +]) def test_middleware_error_after_start_response( - wsgi_env, start_response, context): + exc_type, wsgi_env, start_response, context): def app(wsgi_env, _start_response): _start_response('200 OK', [('Content-Type', 'application/json')]) - raise Exception('error') + raise exc_type('error') extra_env = {'ENV': 'VALUE'} extra_headers = {'Some-Header': 'value'} @@ -497,7 +521,7 @@ def app(wsgi_env, _start_response): output = b''.join(mw(wsgi_env, start_response)) error = json.loads(output.decode('utf8')) - assert error['title'] == 'Request ID: Exception' + assert error['title'] == 'Server Error: ' + exc_type.__name__ assert wsgi_env['ENV'] == 'VALUE' assert wsgi_env['REQUEST_ID'] == 'ID' assert start_response.status == '500 Internal Server Error' @@ -507,15 +531,23 @@ def app(wsgi_env, _start_response): ('X-Request-Id', 'ID'), ] + extra = { + 'status': 500, + 'exc_type': exc_type.__name__, + } + + if exc_type is wsgi.RequestTimeout: + extra['timeout'] = True + context.assert_log( name='talisker.wsgi', msg='GET /', - extra={ - 'status': 500, - 'exc_type': 'Exception', - }, + extra=extra, ) + # check it sent a sentry message + assert len(context.sentry) == 1 + def test_middleware_preserves_file_wrapper( wsgi_env, start_response, context, tmpdir): diff --git a/tests/wsgi_app.py b/tests/wsgi_app.py index 4b552d74..c12ad242 100644 --- a/tests/wsgi_app.py +++ b/tests/wsgi_app.py @@ -29,6 +29,7 @@ import pprint import logging +import time def application(environ, start_response): @@ -44,6 +45,7 @@ def application(environ, start_response): logger.warning('warning') logger.error('error') logger.critical('critical') + time.sleep(10) return [output.encode('utf8')] @@ -54,3 +56,21 @@ def app404(environ, start_response): else: start_response('404 Not Found', []) return [b'Not Found'] + + +def timeout(environ, start_response): + time.sleep(1000) + + +def timeout2(environ, start_response): + start_response('200 OK', [('content-type', 'text/plain')]) + time.sleep(1000) + + +def timeout3(environ, start_response): + start_response('200 OK', [('content-type', 'text/plain')]) + + def i(): + yield time.sleep(1000) + + return i() diff --git a/tox.ini b/tox.ini index 071e323c..d5b04f43 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ skipsdist = True passenv = TRAVIS TRAVIS_JOB_ID TRAVIS_BRANCH usedevelop = True deps = -r{toxinidir}/requirements.tests.txt -commands = py.test -n auto --cov=talisker --no-success-flaky-report +commands = py.test -v --cov=talisker --no-success-flaky-report extras = gunicorn raven