From f80ee15e27b67b6fdd101d5f91cf584d19b2b26e Mon Sep 17 00:00:00 2001 From: Jason Madden Date: Fri, 6 Oct 2023 12:41:59 +0000 Subject: [PATCH] gevent.pywsgi: Much improved handling of chunk trailers. Validation is much stricter to the specification. Fixes #1989 CVE: CVE-2023-41419 Upstream-Status: Backport [https://github.com/gevent/gevent/commit/2f53c851eaf926767fbac62385615efd4886221c] Signed-off-by: Narpat Mali --- docs/changes/1989.bugfix | 26 ++++ src/gevent/pywsgi.py | 229 ++++++++++++++++++++++++------- src/gevent/subprocess.py | 7 +- src/gevent/testing/testcase.py | 2 +- src/gevent/tests/test__pywsgi.py | 193 ++++++++++++++++++++++++-- 5 files changed, 390 insertions(+), 67 deletions(-) create mode 100644 docs/changes/1989.bugfix diff --git a/docs/changes/1989.bugfix b/docs/changes/1989.bugfix new file mode 100644 index 0000000..7ce4a93 --- /dev/null +++ b/docs/changes/1989.bugfix @@ -0,0 +1,26 @@ +Make ``gevent.pywsgi`` comply more closely with the HTTP specification +for chunked transfer encoding. In particular, we are much stricter +about trailers, and trailers that are invalid (too long or featuring +disallowed characters) forcibly close the connection to the client +*after* the results have been sent. + +Trailers otherwise continue to be ignored and are not available to the +WSGI application. + +Previously, carefully crafted invalid trailers in chunked requests on +keep-alive connections might appear as two requests to +``gevent.pywsgi``. Because this was handled exactly as a normal +keep-alive connection with two requests, the WSGI application should +handle it normally. However, if you were counting on some upstream +server to filter incoming requests based on paths or header fields, +and the upstream server simply passed trailers through without +validating them, then this embedded second request would bypass those +checks. (If the upstream server validated that the trailers meet the +HTTP specification, this could not occur, because characters that are +required in an HTTP request, like a space, are not allowed in +trailers.) CVE-2023-41419 was reserved for this. + +Our thanks to the original reporters, Keran Mu +(mkr22@mails.tsinghua.edu.cn) and Jianjun Chen +(jianjun@tsinghua.edu.cn), from Tsinghua University and Zhongguancun +Laboratory. diff --git a/src/gevent/pywsgi.py b/src/gevent/pywsgi.py index 0ebe095..078398a 100644 --- a/src/gevent/pywsgi.py +++ b/src/gevent/pywsgi.py @@ -1,13 +1,28 @@ # Copyright (c) 2005-2009, eventlet contributors # Copyright (c) 2009-2018, gevent contributors """ -A pure-Python, gevent-friendly WSGI server. +A pure-Python, gevent-friendly WSGI server implementing HTTP/1.1. The server is provided in :class:`WSGIServer`, but most of the actual WSGI work is handled by :class:`WSGIHandler` --- a new instance is created for each request. The server can be customized to use different subclasses of :class:`WSGIHandler`. +.. important:: + This server is intended primarily for development and testing, and + secondarily for other "safe" scenarios where it will not be exposed to + potentially malicious input. The code has not been security audited, + and is not intended for direct exposure to the public Internet. For production + usage on the Internet, either choose a production-strength server such as + gunicorn, or put a reverse proxy between gevent and the Internet. +.. versionchanged:: NEXT + Complies more closely with the HTTP specification for chunked transfer encoding. + In particular, we are much stricter about trailers, and trailers that + are invalid (too long or featuring disallowed characters) forcibly close + the connection to the client *after* the results have been sent. + Trailers otherwise continue to be ignored and are not available to the + WSGI application. + """ from __future__ import absolute_import @@ -22,10 +37,7 @@ import time import traceback from datetime import datetime -try: - from urllib import unquote -except ImportError: - from urllib.parse import unquote # python 2 pylint:disable=import-error,no-name-in-module +from urllib.parse import unquote from gevent import socket import gevent @@ -53,29 +65,52 @@ __all__ = [ MAX_REQUEST_LINE = 8192 # Weekday and month names for HTTP date/time formatting; always English! -_WEEKDAYNAME = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] -_MONTHNAME = [None, # Dummy so we can use 1-based month numbers +_WEEKDAYNAME = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") +_MONTHNAME = (None, # Dummy so we can use 1-based month numbers "Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec") # The contents of the "HEX" grammar rule for HTTP, upper and lowercase A-F plus digits, # in byte form for comparing to the network. _HEX = string.hexdigits.encode('ascii') +# The characters allowed in "token" rules. + +# token = 1*tchar +# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" +# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~" +# / DIGIT / ALPHA +# ; any VCHAR, except delimiters +# ALPHA = %x41-5A / %x61-7A ; A-Z / a-z +_ALLOWED_TOKEN_CHARS = frozenset( + # Remember we have to be careful because bytestrings + # inexplicably iterate as integers, which are not equal to bytes. + + # explicit chars then DIGIT + (c.encode('ascii') for c in "!#$%&'*+-.^_`|~0123456789") + # Then we add ALPHA +) | {c.encode('ascii') for c in string.ascii_letters} +assert b'A' in _ALLOWED_TOKEN_CHARS + + # Errors _ERRORS = {} _INTERNAL_ERROR_STATUS = '500 Internal Server Error' _INTERNAL_ERROR_BODY = b'Internal Server Error' -_INTERNAL_ERROR_HEADERS = [('Content-Type', 'text/plain'), - ('Connection', 'close'), - ('Content-Length', str(len(_INTERNAL_ERROR_BODY)))] +_INTERNAL_ERROR_HEADERS = ( + ('Content-Type', 'text/plain'), + ('Connection', 'close'), + ('Content-Length', str(len(_INTERNAL_ERROR_BODY))) +) _ERRORS[500] = (_INTERNAL_ERROR_STATUS, _INTERNAL_ERROR_HEADERS, _INTERNAL_ERROR_BODY) _BAD_REQUEST_STATUS = '400 Bad Request' _BAD_REQUEST_BODY = '' -_BAD_REQUEST_HEADERS = [('Content-Type', 'text/plain'), - ('Connection', 'close'), - ('Content-Length', str(len(_BAD_REQUEST_BODY)))] +_BAD_REQUEST_HEADERS = ( + ('Content-Type', 'text/plain'), + ('Connection', 'close'), + ('Content-Length', str(len(_BAD_REQUEST_BODY))) +) _ERRORS[400] = (_BAD_REQUEST_STATUS, _BAD_REQUEST_HEADERS, _BAD_REQUEST_BODY) _REQUEST_TOO_LONG_RESPONSE = b"HTTP/1.1 414 Request URI Too Long\r\nConnection: close\r\nContent-length: 0\r\n\r\n" @@ -204,23 +239,32 @@ class Input(object): # Read and return the next integer chunk length. If no # chunk length can be read, raises _InvalidClientInput. - # Here's the production for a chunk: - # (http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html) - # chunk = chunk-size [ chunk-extension ] CRLF - # chunk-data CRLF - # chunk-size = 1*HEX - # chunk-extension= *( ";" chunk-ext-name [ "=" chunk-ext-val ] ) - # chunk-ext-name = token - # chunk-ext-val = token | quoted-string - - # To cope with malicious or broken clients that fail to send valid - # chunk lines, the strategy is to read character by character until we either reach - # a ; or newline. If at any time we read a non-HEX digit, we bail. If we hit a - # ;, indicating an chunk-extension, we'll read up to the next - # MAX_REQUEST_LINE characters - # looking for the CRLF, and if we don't find it, we bail. If we read more than 16 hex characters, - # (the number needed to represent a 64-bit chunk size), we bail (this protects us from - # a client that sends an infinite stream of `F`, for example). + # Here's the production for a chunk (actually the whole body): + # (https://www.rfc-editor.org/rfc/rfc7230#section-4.1) + + # chunked-body = *chunk + # last-chunk + # trailer-part + # CRLF + # + # chunk = chunk-size [ chunk-ext ] CRLF + # chunk-data CRLF + # chunk-size = 1*HEXDIG + # last-chunk = 1*("0") [ chunk-ext ] CRLF + # trailer-part = *( header-field CRLF ) + # chunk-data = 1*OCTET ; a sequence of chunk-size octets + + # To cope with malicious or broken clients that fail to send + # valid chunk lines, the strategy is to read character by + # character until we either reach a ; or newline. If at any + # time we read a non-HEX digit, we bail. If we hit a ;, + # indicating an chunk-extension, we'll read up to the next + # MAX_REQUEST_LINE characters ("A server ought to limit the + # total length of chunk extensions received") looking for the + # CRLF, and if we don't find it, we bail. If we read more than + # 16 hex characters, (the number needed to represent a 64-bit + # chunk size), we bail (this protects us from a client that + # sends an infinite stream of `F`, for example). buf = BytesIO() while 1: @@ -228,16 +272,20 @@ class Input(object): if not char: self._chunked_input_error = True raise _InvalidClientInput("EOF before chunk end reached") - if char == b'\r': - break - if char == b';': + + if char in ( + b'\r', # Beginning EOL + b';', # Beginning extension + ): break - if char not in _HEX: + if char not in _HEX: # Invalid data. self._chunked_input_error = True raise _InvalidClientInput("Non-hex data", char) + buf.write(char) - if buf.tell() > 16: + + if buf.tell() > 16: # Too many hex bytes self._chunked_input_error = True raise _InvalidClientInput("Chunk-size too large.") @@ -257,11 +305,72 @@ class Input(object): if char == b'\r': # We either got here from the main loop or from the # end of an extension + self.__read_chunk_size_crlf(rfile, newline_only=True) + result = int(buf.getvalue(), 16) + if result == 0: + # The only time a chunk size of zero is allowed is the final + # chunk. It is either followed by another \r\n, or some trailers + # which are then followed by \r\n. + while self.__read_chunk_trailer(rfile): + pass + return result + + # Trailers have the following production (they are a header-field followed by CRLF) + # See above for the definition of "token". + # + # header-field = field-name ":" OWS field-value OWS + # field-name = token + # field-value = *( field-content / obs-fold ) + # field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ] + # field-vchar = VCHAR / obs-text + # obs-fold = CRLF 1*( SP / HTAB ) + # ; obsolete line folding + # ; see Section 3.2.4 + + + def __read_chunk_trailer(self, rfile, ): + # With rfile positioned just after a \r\n, read a trailer line. + # Return a true value if a non-empty trailer was read, and + # return false if an empty trailer was read (meaning the trailers are + # done). + # If a single line exceeds the MAX_REQUEST_LINE, raise an exception. + # If the field-name portion contains invalid characters, raise an exception. + + i = 0 + empty = True + seen_field_name = False + while i < MAX_REQUEST_LINE: + char = rfile.read(1) + if char == b'\r': + # Either read the next \n or raise an error. + self.__read_chunk_size_crlf(rfile, newline_only=True) + break + # Not a \r, so we are NOT an empty chunk. + empty = False + if char == b':' and i > 0: + # We're ending the field-name part; stop validating characters. + # Unless : was the first character... + seen_field_name = True + if not seen_field_name and char not in _ALLOWED_TOKEN_CHARS: + raise _InvalidClientInput('Invalid token character: %r' % (char,)) + i += 1 + else: + # We read too much + self._chunked_input_error = True + raise _InvalidClientInput("Too large chunk trailer") + return not empty + + def __read_chunk_size_crlf(self, rfile, newline_only=False): + # Also for safety, correctly verify that we get \r\n when expected. + if not newline_only: char = rfile.read(1) - if char != b'\n': + if char != b'\r': self._chunked_input_error = True - raise _InvalidClientInput("Line didn't end in CRLF") - return int(buf.getvalue(), 16) + raise _InvalidClientInput("Line didn't end in CRLF: %r" % (char,)) + char = rfile.read(1) + if char != b'\n': + self._chunked_input_error = True + raise _InvalidClientInput("Line didn't end in LF: %r" % (char,)) def _chunked_read(self, length=None, use_readline=False): # pylint:disable=too-many-branches @@ -294,7 +403,7 @@ class Input(object): self.position += datalen if self.chunk_length == self.position: - rfile.readline() + self.__read_chunk_size_crlf(rfile) if length is not None: length -= datalen @@ -307,9 +416,9 @@ class Input(object): # determine the next size to read self.chunk_length = self.__read_chunk_length(rfile) self.position = 0 - if self.chunk_length == 0: - # Last chunk. Terminates with a CRLF. - rfile.readline() + # If chunk_length was 0, we already read any trailers and + # validated that we have ended with \r\n\r\n. + return b''.join(response) def read(self, length=None): @@ -532,7 +641,8 @@ class WSGIHandler(object): elif len(words) == 2: self.command, self.path = words if self.command != "GET": - raise _InvalidClientRequest('Expected GET method: %r' % (raw_requestline,)) + raise _InvalidClientRequest('Expected GET method; Got command=%r; path=%r; raw=%r' % ( + self.command, self.path, raw_requestline,)) self.request_version = "HTTP/0.9" # QQQ I'm pretty sure we can drop support for HTTP/0.9 else: @@ -1000,14 +1110,28 @@ class WSGIHandler(object): finally: try: self.wsgi_input._discard() - except (socket.error, IOError): - # Don't let exceptions during discarding + except _InvalidClientInput: + # This one is deliberately raised to the outer + # scope, because, with the incoming stream in some bad state, + # we can't be sure we can synchronize and properly parse the next + # request. + raise + except socket.error + # Don't let socket exceptions during discarding # input override any exception that may have been # raised by the application, such as our own _InvalidClientInput. # In the general case, these aren't even worth logging (see the comment # just below) pass - except _InvalidClientInput: + except _InvalidClientInput as ex: + # DO log this one because: + # - Some of the data may have been read and acted on by the + # application; + # - The response may or may not have been sent; + # - It's likely that the client is bad, or malicious, and + # users might wish to take steps to block the client. + self._handle_client_error(ex) + self.close_connection = True self._send_error_response_if_possible(400) except socket.error as ex: if ex.args[0] in self.ignored_socket_errors: @@ -1054,17 +1178,22 @@ class WSGIHandler(object): def _handle_client_error(self, ex): # Called for invalid client input # Returns the appropriate error response. - if not isinstance(ex, ValueError): + if not isinstance(ex, (ValueError, _InvalidClientInput)): # XXX: Why not self._log_error to send it through the loop's # handle_error method? + # _InvalidClientRequest is a ValueError; _InvalidClientInput is an IOError. traceback.print_exc() if isinstance(ex, _InvalidClientRequest): # No formatting needed, that's already been handled. In fact, because the # formatted message contains user input, it might have a % in it, and attempting # to format that with no arguments would be an error. - self.log_error(ex.formatted_message) + # However, the error messages do not include the requesting IP + # necessarily, so we do add that. + self.log_error('(from %s) %s', self.client_address, ex.formatted_message) else: - self.log_error('Invalid request: %s', str(ex) or ex.__class__.__name__) + self.log_error('Invalid request (from %s): %s', + self.client_address, + str(ex) or ex.__class__.__name__) return ('400', _BAD_REQUEST_RESPONSE) def _headers(self): diff --git a/src/gevent/subprocess.py b/src/gevent/subprocess.py index 38c9bd3..8a8ccad 100644 --- a/src/gevent/subprocess.py +++ b/src/gevent/subprocess.py @@ -352,10 +352,11 @@ def check_output(*popenargs, **kwargs): To capture standard error in the result, use ``stderr=STDOUT``:: - >>> print(check_output(["/bin/sh", "-c", + >>> output = check_output(["/bin/sh", "-c", ... "ls -l non_existent_file ; exit 0"], - ... stderr=STDOUT).decode('ascii').strip()) - ls: non_existent_file: No such file or directory + ... stderr=STDOUT).decode('ascii').strip() + >>> print(output.rsplit(':', 1)[1].strip()) + No such file or directory There is an additional optional argument, "input", allowing you to pass a string to the subprocess's stdin. If you use this argument diff --git a/src/gevent/testing/testcase.py b/src/gevent/testing/testcase.py index cd5db80..aa86dcf 100644 --- a/src/gevent/testing/testcase.py +++ b/src/gevent/testing/testcase.py @@ -225,7 +225,7 @@ class TestCaseMetaClass(type): classDict.pop(key) # XXX: When did we stop doing this? #value = wrap_switch_count_check(value) - value = _wrap_timeout(timeout, value) + #value = _wrap_timeout(timeout, value) error_fatal = getattr(value, 'error_fatal', error_fatal) if error_fatal: value = errorhandler.wrap_error_fatal(value) diff --git a/src/gevent/tests/test__pywsgi.py b/src/gevent/tests/test__pywsgi.py index d2125a8..d46030b 100644 --- a/src/gevent/tests/test__pywsgi.py +++ b/src/gevent/tests/test__pywsgi.py @@ -25,21 +25,11 @@ from gevent import monkey monkey.patch_all() from contextlib import contextmanager -try: - from urllib.parse import parse_qs -except ImportError: - # Python 2 - from urlparse import parse_qs +from urllib.parse import parse_qs import os import sys -try: - # On Python 2, we want the C-optimized version if - # available; it has different corner-case behaviour than - # the Python implementation, and it used by socket.makefile - # by default. - from cStringIO import StringIO -except ImportError: - from io import BytesIO as StringIO +from io import BytesIO as StringIO + import weakref import unittest from wsgiref.validate import validator @@ -156,6 +146,10 @@ class Response(object): @classmethod def read(cls, fd, code=200, reason='default', version='1.1', body=None, chunks=None, content_length=None): + """ + Read an HTTP response, optionally perform assertions, + and return the Response object. + """ # pylint:disable=too-many-branches _status_line, headers = read_headers(fd) self = cls(_status_line, headers) @@ -716,7 +710,14 @@ class TestNegativeReadline(TestCase): class TestChunkedPost(TestCase): + calls = 0 + + def setUp(self): + super().setUp() + self.calls = 0 + def application(self, env, start_response): + self.calls += 1 self.assertTrue(env.get('wsgi.input_terminated')) start_response('200 OK', [('Content-Type', 'text/plain')]) if env['PATH_INFO'] == '/a': @@ -730,6 +731,8 @@ class TestChunkedPost(TestCase): if env['PATH_INFO'] == '/c': return list(iter(lambda: env['wsgi.input'].read(1), b'')) + return [b'We should not get here', env['PATH_INFO'].encode('ascii')] + def test_014_chunked_post(self): data = (b'POST /a HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n' b'Transfer-Encoding: chunked\r\n\r\n' @@ -797,6 +800,170 @@ class TestChunkedPost(TestCase): fd.write(data) read_http(fd, code=400) + def test_trailers_keepalive_ignored(self): + # Trailers after a chunk are ignored. + data = ( + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: keep-alive\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n hai\r\n' + b'0\r\n' # last-chunk + # Normally the final CRLF would go here, but if you put in a + # trailer, it doesn't. + b'trailer1: value1\r\n' + b'trailer2: value2\r\n' + b'\r\n' # Really terminate the chunk. + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: close\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n bye\r\n' + b'0\r\n' # last-chunk + ) + with self.makefile() as fd: + fd.write(data) + read_http(fd, body='oh hai') + read_http(fd, body='oh bye') + + self.assertEqual(self.calls, 2) + + def test_trailers_too_long(self): + # Trailers after a chunk are ignored. + data = ( + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: keep-alive\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n hai\r\n' + b'0\r\n' # last-chunk + # Normally the final CRLF would go here, but if you put in a + # trailer, it doesn't. + b'trailer2: value2' # not lack of \r\n + ) + data += b't' * pywsgi.MAX_REQUEST_LINE + # No termination, because we detect the trailer as being too + # long and abort the connection. + with self.makefile() as fd: + fd.write(data) + read_http(fd, body='oh hai') + with self.assertRaises(ConnectionClosed): + read_http(fd, body='oh bye') + + def test_trailers_request_smuggling_missing_last_chunk_keep_alive(self): + # When something that looks like a request line comes in the trailer + # as the first line, immediately after an invalid last chunk. + # We detect this and abort the connection, because the + # whitespace in the GET line isn't a legal part of a trailer. + # If we didn't abort the connection, then, because we specified + # keep-alive, the server would be hanging around waiting for more input. + data = ( + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: keep-alive\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n hai\r\n' + b'0' # last-chunk, but missing the \r\n + # Normally the final CRLF would go here, but if you put in a + # trailer, it doesn't. + # b'\r\n' + b'GET /path2?a=:123 HTTP/1.1\r\n' + b'Host: a.com\r\n' + b'Connection: close\r\n' + b'\r\n' + ) + with self.makefile() as fd: + fd.write(data) + read_http(fd, body='oh hai') + with self.assertRaises(ConnectionClosed): + read_http(fd) + + self.assertEqual(self.calls, 1) + + def test_trailers_request_smuggling_missing_last_chunk_close(self): + # Same as the above, except the trailers are actually valid + # and since we ask to close the connection we don't get stuck + # waiting for more input. + data = ( + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: close\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n hai\r\n' + b'0\r\n' # last-chunk + # Normally the final CRLF would go here, but if you put in a + # trailer, it doesn't. + # b'\r\n' + b'GETpath2a:123 HTTP/1.1\r\n' + b'Host: a.com\r\n' + b'Connection: close\r\n' + b'\r\n' + ) + with self.makefile() as fd: + fd.write(data) + read_http(fd, body='oh hai') + with self.assertRaises(ConnectionClosed): + read_http(fd) + + def test_trailers_request_smuggling_header_first(self): + # When something that looks like a header comes in the first line. + data = ( + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: keep-alive\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n hai\r\n' + b'0\r\n' # last-chunk, but only one CRLF + b'Header: value\r\n' + b'GET /path2?a=:123 HTTP/1.1\r\n' + b'Host: a.com\r\n' + b'Connection: close\r\n' + b'\r\n' + ) + with self.makefile() as fd: + fd.write(data) + read_http(fd, body='oh hai') + with self.assertRaises(ConnectionClosed): + read_http(fd, code=400) + + self.assertEqual(self.calls, 1) + + def test_trailers_request_smuggling_request_terminates_then_header(self): + data = ( + b'POST /a HTTP/1.1\r\n' + b'Host: localhost\r\n' + b'Connection: keep-alive\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'2\r\noh\r\n' + b'4\r\n hai\r\n' + b'0\r\n' # last-chunk + b'\r\n' + b'Header: value' + b'GET /path2?a=:123 HTTP/1.1\r\n' + b'Host: a.com\r\n' + b'Connection: close\r\n' + b'\r\n' + ) + with self.makefile() as fd: + fd.write(data) + read_http(fd, body='oh hai') + read_http(fd, code=400) + + self.assertEqual(self.calls, 1) + class TestUseWrite(TestCase): -- 2.40.0