/* $NetBSD: picohttpparser.c,v 1.2.2.2 2024/02/25 15:47:17 martin Exp $ */ /* * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase, * Shigeo Mitsunari * * SPDX-License-Identifier: MIT * * The software is licensed under either the MIT License (below) or the Perl * license. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include #include #ifdef __SSE4_2__ #ifdef _MSC_VER #include #else #include #endif #endif #include "picohttpparser.h" #if __GNUC__ >= 3 #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #else #define likely(x) (x) #define unlikely(x) (x) #endif #ifdef _MSC_VER #define ALIGNED(n) _declspec(align(n)) #else #define ALIGNED(n) __attribute__((aligned(n))) #endif #define IS_PRINTABLE_ASCII(c) ((unsigned char)(c)-040u < 0137u) #define CHECK_EOF() \ if (buf == buf_end) { \ *ret = -2; \ return NULL; \ } #define EXPECT_CHAR_NO_CHECK(ch) \ if (*buf++ != ch) { \ *ret = -1; \ return NULL; \ } #define EXPECT_CHAR(ch) \ CHECK_EOF(); \ EXPECT_CHAR_NO_CHECK(ch); #define ADVANCE_TOKEN(tok, toklen) \ do { \ const char *tok_start = buf; \ static const char ALIGNED(16) \ ranges2[16] = "\000\040\177\177"; \ int found2; \ buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \ if (!found2) { \ CHECK_EOF(); \ } \ while (1) { \ if (*buf == ' ') { \ break; \ } else if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { \ if ((unsigned char)*buf < '\040' || \ *buf == '\177') \ { \ *ret = -1; \ return NULL; \ } \ } \ ++buf; \ CHECK_EOF(); \ } \ tok = tok_start; \ toklen = buf - tok_start; \ } while (0) static const char *token_char_map = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0" "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1" "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; static const char * findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found) { *found = 0; #if __SSE4_2__ if (likely(buf_end - buf >= 16)) { __m128i ranges16 = _mm_loadu_si128((const __m128i *)ranges); size_t left = (buf_end - buf) & ~15; do { __m128i b16 = _mm_loadu_si128((const __m128i *)buf); int r = _mm_cmpestri(ranges16, ranges_size, b16, 16, _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS); if (unlikely(r != 16)) { buf += r; *found = 1; break; } buf += 16; left -= 16; } while (likely(left != 0)); } #else /* suppress unused parameter warning */ (void)buf_end; (void)ranges; (void)ranges_size; #endif return buf; } static const char * get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret) { const char *token_start = buf; #ifdef __SSE4_2__ static const char ALIGNED(16) ranges1[16] = "\0\010" /* allow HT */ "\012\037" /* allow SP and up to but not including DEL */ "\177\177"; /* allow chars w. MSB set */ int found; buf = findchar_fast(buf, buf_end, ranges1, 6, &found); if (found) goto FOUND_CTL; #else /* find non-printable char within the next 8 bytes, this is the hottest * code; manually inlined */ while (likely(buf_end - buf >= 8)) { #define DOIT() \ do { \ if (unlikely(!IS_PRINTABLE_ASCII(*buf))) \ goto NonPrintable; \ ++buf; \ } while (0) DOIT(); DOIT(); DOIT(); DOIT(); DOIT(); DOIT(); DOIT(); DOIT(); #undef DOIT continue; NonPrintable: if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) { goto FOUND_CTL; } ++buf; } #endif for (;; ++buf) { CHECK_EOF(); if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) { goto FOUND_CTL; } } } FOUND_CTL: if (likely(*buf == '\015')) { ++buf; EXPECT_CHAR('\012'); *token_len = buf - 2 - token_start; } else if (*buf == '\012') { *token_len = buf - token_start; ++buf; } else { *ret = -1; return NULL; } *token = token_start; return buf; } static const char * is_complete(const char *buf, const char *buf_end, size_t last_len, int *ret) { int ret_cnt = 0; buf = last_len < 3 ? buf : buf + last_len - 3; while (1) { CHECK_EOF(); if (*buf == '\015') { ++buf; CHECK_EOF(); EXPECT_CHAR('\012'); ++ret_cnt; } else if (*buf == '\012') { ++buf; ++ret_cnt; } else { ++buf; ret_cnt = 0; } if (ret_cnt == 2) { return buf; } } *ret = -2; return NULL; } #define PARSE_INT(valp_, mul_) \ if (*buf < '0' || '9' < *buf) { \ buf++; \ *ret = -1; \ return NULL; \ } \ *(valp_) = (mul_) * (*buf++ - '0'); #define PARSE_INT_3(valp_) \ do { \ int res_ = 0; \ PARSE_INT(&res_, 100) \ *valp_ = res_; \ PARSE_INT(&res_, 10) \ *valp_ += res_; \ PARSE_INT(&res_, 1) \ *valp_ += res_; \ } while (0) /* returned pointer is always within [buf, buf_end), or null */ static const char * parse_token(const char *buf, const char *buf_end, const char **token, size_t *token_len, char next_char, int *ret) { /* We use pcmpestri to detect non-token characters. This instruction can * take no more than eight character ranges (8*2*8=128 bits that is the * size of a SSE register). Due to this restriction, characters `|` and * `~` are handled in the slow loop. */ static const char ALIGNED(16) ranges[] = "\x00 " /* control chars and up to SP */ "\"\"" /* 0x22 */ "()" /* 0x28,0x29 */ ",," /* 0x2c */ "//" /* 0x2f */ ":@" /* 0x3a-0x40 */ "[]" /* 0x5b-0x5d */ "{\xff"; /* 0x7b-0xff */ const char *buf_start = buf; int found; buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found); if (!found) { CHECK_EOF(); } while (1) { if (*buf == next_char) { break; } else if (!token_char_map[(unsigned char)*buf]) { *ret = -1; return NULL; } ++buf; CHECK_EOF(); } *token = buf_start; *token_len = buf - buf_start; return buf; } /* returned pointer is always within [buf, buf_end), or null */ static const char * parse_http_version(const char *buf, const char *buf_end, int *minor_version, int *ret) { /* we want at least [HTTP/1.] to try to parse */ if (buf_end - buf < 9) { *ret = -2; return NULL; } EXPECT_CHAR_NO_CHECK('H'); EXPECT_CHAR_NO_CHECK('T'); EXPECT_CHAR_NO_CHECK('T'); EXPECT_CHAR_NO_CHECK('P'); EXPECT_CHAR_NO_CHECK('/'); EXPECT_CHAR_NO_CHECK('1'); EXPECT_CHAR_NO_CHECK('.'); PARSE_INT(minor_version, 1); return buf; } static const char * parse_headers(const char *buf, const char *buf_end, struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret) { for (;; ++*num_headers) { CHECK_EOF(); if (*buf == '\015') { ++buf; EXPECT_CHAR('\012'); break; } else if (*buf == '\012') { ++buf; break; } if (*num_headers == max_headers) { *ret = -1; return NULL; } if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) { /* parsing name, but do not discard SP before colon, see * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */ if ((buf = parse_token(buf, buf_end, &headers[*num_headers].name, &headers[*num_headers].name_len, ':', ret)) == NULL) { return NULL; } if (headers[*num_headers].name_len == 0) { *ret = -1; return NULL; } ++buf; for (;; ++buf) { CHECK_EOF(); if (!(*buf == ' ' || *buf == '\t')) { break; } } } else { headers[*num_headers].name = NULL; headers[*num_headers].name_len = 0; } const char *value; size_t value_len; if ((buf = get_token_to_eol(buf, buf_end, &value, &value_len, ret)) == NULL) { return NULL; } /* remove trailing SPs and HTABs */ const char *value_end = value + value_len; for (; value_end != value; --value_end) { const char c = *(value_end - 1); if (!(c == ' ' || c == '\t')) { break; } } headers[*num_headers].value = value; headers[*num_headers].value_len = value_end - value; } return buf; } static const char * parse_request(const char *buf, const char *buf_end, const char **method, size_t *method_len, const char **path, size_t *path_len, int *minor_version, struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret) { /* skip first empty line (some clients add CRLF after POST content) */ CHECK_EOF(); if (*buf == '\015') { ++buf; EXPECT_CHAR('\012'); } else if (*buf == '\012') { ++buf; } /* parse request line */ if ((buf = parse_token(buf, buf_end, method, method_len, ' ', ret)) == NULL) { return NULL; } do { ++buf; CHECK_EOF(); } while (*buf == ' '); ADVANCE_TOKEN(*path, *path_len); do { ++buf; CHECK_EOF(); } while (*buf == ' '); if (*method_len == 0 || *path_len == 0) { *ret = -1; return NULL; } if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) { return NULL; } if (*buf == '\015') { ++buf; EXPECT_CHAR('\012'); } else if (*buf == '\012') { ++buf; } else { *ret = -1; return NULL; } return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret); } int phr_parse_request(const char *buf_start, size_t len, const char **method, size_t *method_len, const char **path, size_t *path_len, int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len) { const char *buf = buf_start, *buf_end = buf_start + len; size_t max_headers = *num_headers; int r = -1; *method = NULL; *method_len = 0; *path = NULL; *path_len = 0; *minor_version = -1; *num_headers = 0; /* if last_len != 0, check if the request is complete (a fast countermeasure againt slowloris */ if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) { return r; } if ((buf = parse_request(buf, buf_end, method, method_len, path, path_len, minor_version, headers, num_headers, max_headers, &r)) == NULL) { return r; } return (int)(buf - buf_start); } static const char * parse_response(const char *buf, const char *buf_end, int *minor_version, int *status, const char **msg, size_t *msg_len, struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret) { /* parse "HTTP/1.x" */ if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) { return NULL; } /* skip space */ if (*buf != ' ') { *ret = -1; return NULL; } do { ++buf; CHECK_EOF(); } while (*buf == ' '); /* parse status code, we want at least [:digit:][:digit:][:digit:] to try to parse */ if (buf_end - buf < 4) { *ret = -2; return NULL; } PARSE_INT_3(status); /* get message including preceding space */ if ((buf = get_token_to_eol(buf, buf_end, msg, msg_len, ret)) == NULL) { return NULL; } if (*msg_len == 0) { /* ok */ } else if (**msg == ' ') { /* Remove preceding space. Successful return from * `get_token_to_eol` guarantees that we would hit something * other than SP before running past the end of the given * buffer. */ do { ++*msg; --*msg_len; } while (**msg == ' '); } else { /* garbage found after status code */ *ret = -1; return NULL; } return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret); } int phr_parse_response(const char *buf_start, size_t len, int *minor_version, int *status, const char **msg, size_t *msg_len, struct phr_header *headers, size_t *num_headers, size_t last_len) { const char *buf = buf_start, *buf_end = buf + len; size_t max_headers = *num_headers; int r; *minor_version = -1; *status = 0; *msg = NULL; *msg_len = 0; *num_headers = 0; /* if last_len != 0, check if the response is complete (a fast countermeasure against slowloris */ if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) { return r; } if ((buf = parse_response(buf, buf_end, minor_version, status, msg, msg_len, headers, num_headers, max_headers, &r)) == NULL) { return r; } return (int)(buf - buf_start); } int phr_parse_headers(const char *buf_start, size_t len, struct phr_header *headers, size_t *num_headers, size_t last_len) { const char *buf = buf_start, *buf_end = buf + len; size_t max_headers = *num_headers; int r; *num_headers = 0; /* if last_len != 0, check if the response is complete (a fast countermeasure against slowloris */ if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) { return r; } if ((buf = parse_headers(buf, buf_end, headers, num_headers, max_headers, &r)) == NULL) { return r; } return (int)(buf - buf_start); } enum { CHUNKED_IN_CHUNK_SIZE, CHUNKED_IN_CHUNK_EXT, CHUNKED_IN_CHUNK_DATA, CHUNKED_IN_CHUNK_CRLF, CHUNKED_IN_TRAILERS_LINE_HEAD, CHUNKED_IN_TRAILERS_LINE_MIDDLE }; static int decode_hex(int ch) { if ('0' <= ch && ch <= '9') { return ch - '0'; } else if ('A' <= ch && ch <= 'F') { return ch - 'A' + 0xa; } else if ('a' <= ch && ch <= 'f') { return ch - 'a' + 0xa; } else { return -1; } } ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *_bufsz) { size_t dst = 0, src = 0, bufsz = *_bufsz; ssize_t ret = -2; /* incomplete */ while (1) { switch (decoder->_state) { case CHUNKED_IN_CHUNK_SIZE: for (;; ++src) { int v; if (src == bufsz) goto Exit; if ((v = decode_hex(buf[src])) == -1) { if (decoder->_hex_count == 0) { ret = -1; goto Exit; } break; } if (decoder->_hex_count == sizeof(size_t) * 2) { ret = -1; goto Exit; } decoder->bytes_left_in_chunk = decoder->bytes_left_in_chunk * 16 + v; ++decoder->_hex_count; } decoder->_hex_count = 0; decoder->_state = CHUNKED_IN_CHUNK_EXT; /* fallthru */ case CHUNKED_IN_CHUNK_EXT: /* RFC 7230 A.2 "Line folding in chunk extensions is * disallowed" */ for (;; ++src) { if (src == bufsz) goto Exit; if (buf[src] == '\012') break; } ++src; if (decoder->bytes_left_in_chunk == 0) { if (decoder->consume_trailer) { decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD; break; } else { goto Complete; } } decoder->_state = CHUNKED_IN_CHUNK_DATA; /* fallthru */ case CHUNKED_IN_CHUNK_DATA: { size_t avail = bufsz - src; if (avail < decoder->bytes_left_in_chunk) { if (dst != src) memmove(buf + dst, buf + src, avail); src += avail; dst += avail; decoder->bytes_left_in_chunk -= avail; goto Exit; } if (dst != src) memmove(buf + dst, buf + src, decoder->bytes_left_in_chunk); src += decoder->bytes_left_in_chunk; dst += decoder->bytes_left_in_chunk; decoder->bytes_left_in_chunk = 0; decoder->_state = CHUNKED_IN_CHUNK_CRLF; } /* fallthru */ case CHUNKED_IN_CHUNK_CRLF: for (;; ++src) { if (src == bufsz) goto Exit; if (buf[src] != '\015') break; } if (buf[src] != '\012') { ret = -1; goto Exit; } ++src; decoder->_state = CHUNKED_IN_CHUNK_SIZE; break; case CHUNKED_IN_TRAILERS_LINE_HEAD: for (;; ++src) { if (src == bufsz) goto Exit; if (buf[src] != '\015') break; } if (buf[src++] == '\012') goto Complete; decoder->_state = CHUNKED_IN_TRAILERS_LINE_MIDDLE; /* fallthru */ case CHUNKED_IN_TRAILERS_LINE_MIDDLE: for (;; ++src) { if (src == bufsz) goto Exit; if (buf[src] == '\012') break; } ++src; decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD; break; default: assert(!"decoder is corrupt"); } } Complete: ret = bufsz - src; Exit: if (dst != src) memmove(buf + dst, buf + src, bufsz - src); *_bufsz = dst; return ret; } int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder) { return decoder->_state == CHUNKED_IN_CHUNK_DATA; } #undef CHECK_EOF #undef EXPECT_CHAR #undef ADVANCE_TOKEN