BodyProcessor.java

package com.renomad.minum.web;

import com.renomad.minum.logging.ILogger;
import com.renomad.minum.security.ForbiddenUseException;
import com.renomad.minum.state.Constants;
import com.renomad.minum.state.Context;
import com.renomad.minum.utils.StringUtils;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This code is responsible for extracting the {@link Body} from
 * an HTTP request.
 */
final class BodyProcessor implements IBodyProcessor {

    private final ILogger logger;
    private final IInputStreamUtils inputStreamUtils;
    private final Constants constants;

    BodyProcessor(Context context) {
        this.constants = context.getConstants();
        this.logger = context.getLogger();
        this.inputStreamUtils = new InputStreamUtils(constants.maxReadLineSizeBytes);
    }

    @Override
    public Body extractData(InputStream is, Headers h) {
        final var contentType = h.contentType();

        if (h.contentLength() >= 0) {
            if (h.contentLength() >= constants.maxReadSizeBytes) {
                throw new ForbiddenUseException("It is disallowed to process a body with a length more than " + constants.maxReadSizeBytes + " bytes");
            }
        } else {
            // we don't process chunked transfer encodings.  just bail.
            List<String> transferEncodingHeaders = h.valueByKey("transfer-encoding");
            if (List.of("chunked").equals(transferEncodingHeaders)) {
                logger.logDebug(() -> "client sent chunked transfer-encoding.  Minum does not automatically read bodies of this type.");
            }
            return Body.EMPTY;
        }

        return extractBodyFromInputStream(h.contentLength(), contentType, is);
    }

    /**
     * Handles the parsing of the body data for either form-urlencoded or
     * multipart/form-data
     *
     * @param contentType a mime value which must be either application/x-www-form-urlencoded
     *                    or multipart/form-data.  Anything else will cause a new Body to
     *                    be created with the body bytes, unparsed.  There are a number of
     *                    cases where this makes sense - if the user is sending us plain text,
     *                    html, json, or css, we want to simply accept the data and not try to parse it.
     */
    Body extractBodyFromInputStream(int contentLength, String contentType, InputStream is) {
        // if the body is zero bytes long, just return
        if (contentLength == 0) {
            logger.logDebug(() -> "the length of the body was 0, returning an empty Body");
            return Body.EMPTY;
        }

        if (contentType.contains("application/x-www-form-urlencoded")) {
            return parseUrlEncodedForm(is, contentLength);
        } else if (contentType.contains("multipart/form-data")) {
            String boundaryValue = determineBoundaryValue(contentType);
            return parseMultipartForm(contentLength, boundaryValue, is);
        } else {
            logger.logDebug(() -> "did not recognize a key-value pattern content-type, returning the raw bytes for the body.  Content-Type was: " + contentType);
            // we can return the whole byte array here because we never read from it
            return new Body(Map.of(), inputStreamUtils.read(contentLength, is), List.of(), BodyType.UNRECOGNIZED);
        }
    }

    /**
     * Parse multipart/form protocol.
     * @param contentLength the length of incoming data, found in the "content-length" header
     * @param boundaryValue the randomly-generated boundary value between the partitions.  Research
     *                      multipart/form data protocol for further information.
     * @param inputStream A stream of bytes coming from the socket.
     */
    private Body parseMultipartForm(int contentLength, String boundaryValue, InputStream inputStream) {

        if (boundaryValue.isBlank()) {
            logger.logDebug(() -> "The boundary value was blank for the multipart input. Returning an empty map");
            return new Body(Map.of(), new byte[0], List.of(), BodyType.UNRECOGNIZED);
        }

        List<Partition> partitions = new ArrayList<>();

        try {
            int countOfPartitions = 0;
            for (StreamingMultipartPartition p : getMultiPartIterable(inputStream, boundaryValue, contentLength)) {
                countOfPartitions += 1;
                if (countOfPartitions >= MAX_BODY_KEYS_URL_ENCODED) {
                    throw new WebServerException("Error: body had excessive number of partitions (" + countOfPartitions + ").  Maximum allowed: " + MAX_BODY_KEYS_URL_ENCODED);
                }
                partitions.add(new Partition(p.getHeaders(), p.readAllBytes(), p.getContentDisposition()));
            }


        } catch (Exception ex) {
            logger.logDebug(() -> "Unable to parse this body. returning what we have so far.  Exception message: " + ex.getMessage());
            // we have to return nothing for the raw bytes, because at this point we are halfway through
            // reading the inputstream and don't want to return broken data
            return new Body(Map.of(), new byte[0], partitions, BodyType.MULTIPART);
        }
        if (partitions.isEmpty()) {
            return new Body(Map.of(), new byte[0], List.of(), BodyType.UNRECOGNIZED);
        } else {
            return new Body(Map.of(), new byte[0], partitions, BodyType.MULTIPART);
        }
    }

    /**
     * Given the "content-type" header, determine the boundary value.  A typical
     * multipart content-type header might look like this: <pre>Content-Type: multipart/form-data; boundary=i_am_a_boundary</pre>
     */
    private static String determineBoundaryValue(String contentType) {
        String boundaryKey = "boundary=";
        String boundaryValue = "";
        int indexOfBoundaryKey = contentType.indexOf(boundaryKey);
        if (indexOfBoundaryKey > 0) {
            // grab all the text after the key to obtain the boundary value
            boundaryValue = contentType.substring(indexOfBoundaryKey + boundaryKey.length());
        }
        return boundaryValue;
    }


    /**
     * Parse data formatted by application/x-www-form-urlencoded
     * See <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/POST">...</a>
     * <p>
     * See here for the encoding: <a href="https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding">...</a>
     * <p>
     * for example, {@code valuea=3&valueb=this+is+something}
     */
    Body parseUrlEncodedForm(InputStream is, int contentLength) {
        if (contentLength == 0) {
            return Body.EMPTY;
        }
        final var postedPairs = new HashMap<String, byte[]>();

        try {
            int countOfPartitions = 0;
            for (final var keyValue : getUrlEncodedDataIterable(is, contentLength)) {
                countOfPartitions += 1;
                if (countOfPartitions >= MAX_BODY_KEYS_URL_ENCODED) {
                    throw new WebServerException("Error: body had excessive number of partitions ("+countOfPartitions+").  Maximum allowed: " + MAX_BODY_KEYS_URL_ENCODED);
                }
                String value = new String(keyValue.getUedg().readAllBytes(), StandardCharsets.US_ASCII);
                String key = keyValue.getKey();
                final var decodedValue = StringUtils.decode(value);
                final var convertedValue = decodedValue == null ? "".getBytes(StandardCharsets.UTF_8) : decodedValue.getBytes(StandardCharsets.UTF_8);

                final var result = postedPairs.put(key, convertedValue);

                if (result != null) {
                    throw new WebServerException("Error: key (" +key + ") was duplicated in the post body - previous version was " + new String(result, StandardCharsets.US_ASCII) + " and recent data was " + decodedValue);
                }
            }
        } catch (Exception ex) {
            logger.logDebug(() -> "Unable to parse this body. returning what we have so far.  Exception message: " + ex.getMessage());
            // we have to return nothing for the raw bytes, because at this point we are halfway through
            // reading the inputstream and don't want to return broken data
            return new Body(postedPairs, new byte[0], List.of(), BodyType.UNRECOGNIZED);
        }
        // we return nothing for the raw bytes because the code for parsing the streaming data
        // doesn't begin with a fully-read byte array - it pulls data off the stream one byte
        // at a time.
        return new Body(postedPairs, new byte[0], List.of(), BodyType.FORM_URL_ENCODED);
    }

    /**
     * A regex used to extract the name value from the headers in multipart/form
     * For example, in the following code, you can see that the name is "image_uploads"
     * <pre>
     * {@code
     * --i_am_a_boundary
     * Content-Type: text/plain
     * Content-Disposition: form-data; name="text1"
     *
     * I am a value that is text
     * --i_am_a_boundary
     * Content-Type: application/octet-stream
     * Content-Disposition: form-data; name="image_uploads"; filename="photo_preview.jpg"
     * }
     * </pre>
     */
    private static final Pattern multiformNameRegex = Pattern.compile("\\bname\\b=\"(?<namevalue>.*?)\"");
    private static final Pattern multiformFilenameRegex = Pattern.compile("\\bfilename\\b=\"(?<namevalue>.*?)\"");

    @Override
    public Iterable<UrlEncodedKeyValue> getUrlEncodedDataIterable(InputStream inputStream, long contentLength) {
        return () -> new Iterator<>() {

            final CountBytesRead countBytesRead = new CountBytesRead();

            @Override
            public boolean hasNext() {
                return countBytesRead.getCount() < contentLength;
            }

            @Override
            public UrlEncodedKeyValue next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                String key = "";
                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                while(true) {
                    int result = 0;
                    try {
                        result = inputStream.read();
                        countBytesRead.increment();
                    } catch (IOException e) {
                        throw new WebServerException(e);
                    }
                    // if this is true, the inputstream is closed
                    if (result == -1) break;
                    byte myByte = (byte) result;
                    // if this is true, we're done with the key
                    if (myByte == '=') {
                        // URL encoding is in ASCII only.
                        key = byteArrayOutputStream.toString(StandardCharsets.US_ASCII);
                        break;
                    } else {
                        if (byteArrayOutputStream.size() >= MAX_KEY_SIZE_BYTES) {
                            throw new WebServerException("Maximum size for name attribute is " + MAX_KEY_SIZE_BYTES + " ascii characters");
                        }
                        byteArrayOutputStream.write(myByte);
                    }
                }

                if (key.isBlank()) {
                    throw new WebServerException("Unable to parse this body. no key found during parsing");
                } else if (countBytesRead.getCount() == contentLength) {
                    // if the only thing sent was the key and there's no further data, return the key with a null input stream
                    // that will immediately return
                    return new UrlEncodedKeyValue(key, new UrlEncodedDataGetter(InputStream.nullInputStream(), countBytesRead, contentLength));
                } else {
                    return new UrlEncodedKeyValue(key, new UrlEncodedDataGetter(inputStream, countBytesRead, contentLength));
                }
            }
        };
    }


    @Override
    public Iterable<StreamingMultipartPartition> getMultiPartIterable(InputStream inputStream, String boundaryValue, int contentLength) {
        return () -> new Iterator<>() {

            final CountBytesRead countBytesRead = new CountBytesRead();
            boolean hasReadFirstPartition = false;

            @Override
            public boolean hasNext() {
                // determining if we have more to read is a little tricky because we have a buffer
                // filled by reading ahead, looking for the boundary value
                return (contentLength - countBytesRead.getCount()) > boundaryValue.length();
            }

            @Override
            public StreamingMultipartPartition next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                // confirm that the boundary value is as expected, as a sanity check,
                // and avoid including the boundary value in the first set of headers
                if (! hasReadFirstPartition) {
                    String s;
                    try {
                        s = inputStreamUtils.readLine(inputStream);
                        countBytesRead.incrementBy(s.length() + 2);
                        hasReadFirstPartition = true;
                        if (!s.contains(boundaryValue)) {
                            throw new IOException("Error: First line must contain the expected boundary value. Expected to find: "+ boundaryValue + " in: " + s);
                        }
                    } catch (IOException e) {
                        throw new WebServerException(e);
                    }
                }
                List<String> allHeaders = Headers.getAllHeaders(inputStream, inputStreamUtils);
                int lengthOfHeaders = allHeaders.stream().map(String::length).reduce(0, Integer::sum);
                // each line has a CR + LF (that's two bytes) and the headers end with a second pair of CR+LF.
                int extraCrLfs = (2 * allHeaders.size()) + 2;
                countBytesRead.incrementBy(lengthOfHeaders + extraCrLfs);

                Headers headers = new Headers(allHeaders);

                List<String> cds = headers.valueByKey("Content-Disposition");
                if (cds == null) {
                    throw new WebServerException("Error: no Content-Disposition header on partition in Multipart/form data");
                }
                String contentDisposition = String.join(";", cds);

                Matcher nameMatcher = multiformNameRegex.matcher(contentDisposition);
                Matcher filenameMatcher = multiformFilenameRegex.matcher(contentDisposition);

                String name = "";
                if (nameMatcher.find()) {
                    name = nameMatcher.group("namevalue");
                } else {
                    throw new WebServerException("Error: No name value set on multipart partition");
                }
                String filename = "";
                if (filenameMatcher.find()) {
                    filename = filenameMatcher.group("namevalue");
                }

                // at this point our inputstream pointer is at the beginning of the
                // body data.  From here until the end it's pure data.

                return new StreamingMultipartPartition(headers, inputStream, new ContentDisposition(name, filename), boundaryValue, countBytesRead, contentLength);
            }


        };
    }

}