bazarr/libs/waitress/parser.py

##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""HTTP Request Parser

This server uses asyncore to accept connections and do initial
processing but threads to do work.
"""
import re
from io import BytesIO

from waitress.compat import (
    tostr,
    urlparse,
    unquote_bytes_to_wsgi,
)

from waitress.buffers import OverflowableBuffer

from waitress.receiver import (
    FixedStreamReceiver,
    ChunkedReceiver,
)

from waitress.utilities import (
    find_double_newline,
    RequestEntityTooLarge,
    RequestHeaderFieldsTooLarge,
    BadRequest,
)

class ParsingError(Exception):
    pass

class HTTPRequestParser(object):
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.
    """
    completed = False        # Set once request is completed.
    empty = False            # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False # True when headers have been read
    header_plus = b''
    chunked = False
    content_length = 0
    header_bytes_received = 0
    body_bytes_received = 0
    body_rcv = None
    version = '1.0'
    error = None
    connection_close = False

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request.  Returns the number of
        bytes consumed.  Sets the completed flag once both the header and the
        body have been received.
        """
        if self.completed:
            return 0 # Can't consume any more.
        datalen = len(data)
        br = self.body_rcv
        if br is None:
            # In header.
            s = self.header_plus + data
            index = find_double_newline(s)
            if index >= 0:
                # Header finished.
                header_plus = s[:index]
                consumed = len(data) - (len(s) - index)
                # Remove preceeding blank lines.
                header_plus = header_plus.lstrip()
                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True
                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large
                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    'exceeds max_body of %s' % max_body)
                                self.completed = True
                self.headers_finished = True
                return consumed
            else:
                # Header not finished yet.
                self.header_bytes_received += datalen
                max_header = self.adj.max_request_header_size
                if self.header_bytes_received >= max_header:
                    # malformed header, we need to construct some request
                    # on our own. we disregard the incoming(?) requests HTTP
                    # version and just use 1.0. IOW someone just sent garbage
                    # over the wire
                    self.parse_header(b'GET / HTTP/1.0\n')
                    self.error = RequestHeaderFieldsTooLarge(
                        'exceeds max_header of %s' % max_header)
                    self.completed = True
                self.header_plus = s
                return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size
            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge(
                    'exceeds max_body of %s' % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True
                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here.  We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers['CONTENT_LENGTH'] = str(br.__len__())
            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).
        """
        index = header_plus.find(b'\n')
        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 1:]
        else:
            first_line = header_plus.rstrip()
            header = b''

        self.first_line = first_line # for testing

        lines = get_header_lines(header)

        headers = self.headers
        for line in lines:
            index = line.find(b':')
            if index > 0:
                key = line[:index]
                if b'_' in key:
                    continue
                value = line[index + 1:].strip()
                key1 = tostr(key.upper().replace(b'-', b'_'))
                # If a header already exists, we append subsequent values
                # seperated by a comma. Applications already need to handle
                # the comma seperated values, as HTTP front ends might do
                # the concatenation for you (behavior specified in RFC2616).
                try:
                    headers[key1] += tostr(b', ' + value)
                except KeyError:
                    headers[key1] = tostr(value)
            # else there's garbage in the headers?

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)
        version = tostr(version)
        command = tostr(command)
        self.command = command
        self.version = version
        (self.proxy_scheme,
         self.proxy_netloc,
         self.path,
         self.query, self.fragment) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get('CONNECTION', '')

        if version == '1.0':
            if connection.lower() != 'keep-alive':
                self.connection_close = True

        if version == '1.1':
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop('TRANSFER_ENCODING', '')
            if te.lower() == 'chunked':
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            expect = headers.get('EXPECT', '').lower()
            self.expect_continue = expect == '100-continue'
            if connection.lower() == 'close':
                self.connection_close = True

        if not self.chunked:
            try:
                cl = int(headers.get('CONTENT_LENGTH', 0))
            except ValueError:
                cl = 0
            self.content_length = cl
            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        body_rcv = self.body_rcv
        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        body_rcv = self.body_rcv
        if body_rcv is not None:
            body_rcv.getbuf().close()

def split_uri(uri):
    # urlsplit handles byte input by returning bytes on py3, so
    # scheme, netloc, path, query, and fragment are bytes
    try:
        scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
    except UnicodeError:
        raise ParsingError('Bad URI')
    return (
        tostr(scheme),
        tostr(netloc),
        unquote_bytes_to_wsgi(path),
        tostr(query),
        tostr(fragment),
    )

def get_header_lines(header):
    """
    Splits the header into lines, putting multi-line headers together.
    """
    r = []
    lines = header.split(b'\n')
    for line in lines:
        if line.startswith((b' ', b'\t')):
            if not r:
                # http://corte.si/posts/code/pathod/pythonservers/index.html
                raise ParsingError('Malformed header line "%s"' % tostr(line))
            r[-1] += line
        else:
            r.append(line)
    return r

first_line_re = re.compile(
    b'([^ ]+) '
    b'((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)'
    b'(( HTTP/([0-9.]+))$|$)'
)

def crack_first_line(line):
    m = first_line_re.match(line)
    if m is not None and m.end() == len(line):
        if m.group(3):
            version = m.group(5)
        else:
            version = None
        method = m.group(1)

        # the request methods that are currently defined are all uppercase:
        # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
        # the request method is case sensitive according to
        # https://tools.ietf.org/html/rfc7231#section-4.1

        # By disallowing anything but uppercase methods we save poor
        # unsuspecting souls from sending lowercase HTTP methods to waitress
        # and having the request complete, while servers like nginx drop the
        # request onto the floor.
        if method != method.upper():
            raise ParsingError('Malformed HTTP method "%s"' % tostr(method))
        uri = m.group(2)
        return method, uri, version
    else:
        return b'', b'', b''