From 236f7399fe23b68d65287f3949a99da197f2dd16 Mon Sep 17 00:00:00 2001
From: Nicholas Orlowsky
Date: Tue, 4 Feb 2025 18:27:33 -0500
Subject: [PATCH] request parser rewrite + added tests

* rewrote request parser, now more simplified and theoretically faster
* added gtest and an example test to measure parser times
---
 CMakeLists.txt                     |  37 ++++-
 lib/http/request.cpp               | 215 ++++++++++++++++-------------
 lib/http/request.hpp               |  23 +--
 tests/speed_tests.cpp              |  30 ++++
 tests/test_files/test_request.http |  13 ++
 5 files changed, 210 insertions(+), 108 deletions(-)
 create mode 100644 tests/speed_tests.cpp
 create mode 100644 tests/test_files/test_request.http

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 273dfc8..949bb34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,7 @@ add_custom_target(build-version
 
 add_custom_target(build-supplemental
     COMMAND cd ../build_supp && python3 ./error_gen.py
-    COMMAND mkdir www && cp -r ../default_www/regular/* ./www/
+    COMMAND mkdir -p www && cp -r ../default_www/regular/* ./www/
     DEPENDS build_supp/version.txt ../default_www/regular/* build_supp/error_gen.py build-version
     COMMENT "Generated supplemental build files (default www dir + error pages)"
 )
@@ -38,3 +38,38 @@ add_dependencies(anthracite-bin anthracite)
 
 add_executable(anthracite-api-bin src/api_main.cpp)
 target_link_libraries(anthracite-api-bin anthracite)
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
+)
+
+FetchContent_MakeAvailable(googletest)
+
+file(GLOB TESTS_SRC CONFIGURE_DEPENDS "tests/*.cpp")
+enable_testing()
+
+add_custom_target(test_files
+    COMMAND cp -r ../tests/test_files .
+    DEPENDS ../tests/test_files/*
+    COMMENT "Copied test resource files"
+)
+
+add_executable(
+  tests
+  ${TESTS_SRC}
+)
+add_dependencies(tests anthracite)
+add_dependencies(tests test_files)
+
+target_link_libraries(
+  tests
+  GTest::gtest_main
+)
+target_link_libraries(
+  tests
+  anthracite
+)
+
+include(GoogleTest)
+gtest_discover_tests(tests)
diff --git a/lib/http/request.cpp b/lib/http/request.cpp
index 0d1bc2a..ea688b0 100644
--- a/lib/http/request.cpp
+++ b/lib/http/request.cpp
@@ -5,115 +5,133 @@
 
 namespace anthracite::http {
 
+// Parses one "Name: value" header line (line ending already removed) and stores
+// it in _headers. A line without a ':' is not a header and is skipped.
+void request::parse_header(std::string& raw_line) {
+    auto delim_pos = raw_line.find_first_of(':');
+    if (delim_pos == std::string::npos) {
+        return;
+    }
+
+    auto value_pos = raw_line.find_first_not_of(' ', delim_pos+1);
+
+    std::string header_name = raw_line.substr(0,delim_pos);
+    // value_pos is npos when nothing but spaces follows the ':' - substr(npos) would throw
+    std::string header_val = (value_pos == std::string::npos) ? std::string() : raw_line.substr(value_pos);
+
+    _headers[header_name] = header(header_name, header_val);
+}
+
+// Parses one "name=value" query parameter and stores it in _query_params.
+// A parameter without '=' (e.g. "?debug") is stored with an empty value.
+void request::parse_query_param(std::string& raw_param) {
+    auto delim_pos = raw_param.find_first_of('=');
+    bool has_value = delim_pos != std::string::npos;
+
+    std::string query_name = raw_param.substr(0,delim_pos);
+    // without this check npos+1 wraps around to 0 and the value would be a copy of the name
+    std::string query_val = has_value ? raw_param.substr(delim_pos+1) : std::string();
+
+    _query_params[query_name] = query_param(query_name, query_val);
+}
+
+// Splits the request target at the first '?': the part before it becomes _path,
+// the rest is split on '&' and every piece is passed to parse_query_param().
+void request::parse_path(std::string& raw_path) {
+    std::stringstream ss(raw_path);
+    std::string tok;
+
+    if (getline(ss, tok, '?')){
+        _path = tok;
+    }
+
+    while(getline(ss, tok, '&')) {
+        parse_query_param(tok);
+    }
+}
+
+// Parses the request line ("METHOD target VERSION"), split on single spaces.
+// Unrecognised methods map to method::UNKNOWN, unrecognised versions fall back
+// to version::HTTP_1_0. Anything after the version token is ignored.
+void request::parse_request_line(std::string& raw_line) {
+    request_line_parser_state state = METHOD;
+    std::stringstream ss(raw_line);
+    std::string tok;
+    while(getline(ss, tok, ' ')){
+        switch(state) {
+            case METHOD: {
+                auto search = method_map.find(tok);
+                if (search != method_map.end()) {
+                    _method = search->second;
+                } else {
+                    _method = method::UNKNOWN;
+                }
+
+                state = PATH;
+                break;
+            };
+
+            case PATH: {
+                parse_path(tok);
+                state = VERSION;
+                break;
+            };
+
+            case VERSION: {
+                auto search = version_map.find(tok);
+                if (search != version_map.end()) {
+                    _http_version = search->second;
+                } else {
+                    _http_version = version::HTTP_1_0;
+                }
+                return;
+            };
+        }
+    }
+}
+
 request::request(std::string& raw_data, const std::string& client_ip)
     : _path("")
     , _client_ipaddr(client_ip)
 {
 
-    parser_state state = METHOD;
+    parser_state state = REQUEST_LINE;
+    // defaults, so these members are never left uninitialised by an empty or truncated request
+    _method = method::UNKNOWN;
+    _http_version = version::HTTP_1_0;
 
-    std::string scratch = "";
-    std::string scratch_2 = "";
-    for (int i = 0; i < raw_data.length(); i++) {
-        switch (state) {
-        case METHOD: {
-            if (raw_data[i] == ' ') {
-                if (method_map.find(scratch) == method_map.end()) {
-                    _method = method::UNKNOWN;
+    std::stringstream line_stream(raw_data);
+    std::string line;
+
+    // state is tested before getline() so that the first line of the body is not
+    // consumed (and thrown away) once the blank line ending the headers was seen
+    while(state != BODY_CONTENT && getline(line_stream, line, '\n')){
+        // HTTP requests do newline as \r\n, this removes the \r (when there is one)
+        if (!line.empty() && line.back() == '\r') {
+            line.pop_back();
+        }
+        switch(state) {
+            case REQUEST_LINE: {
+                parse_request_line(line);
+                state = HEADERS;
+                break;
+            };
+            case HEADERS: {
+                if (line.length() == 0) {
+                    state = BODY_CONTENT;
                 } else {
-                    _method = method_map.find(scratch)->second;
+                    parse_header(line);
                 }
-                scratch = "";
-                state = PATH;
-            } else {
-                scratch += raw_data[i];
-            }
-        } break;
-
-        case PATH: {
-            switch (raw_data[i]) {
-            case ' ':
-                state = VERSION;
                 break;
-            case '?':
-                state = QUERY_PARAM_NAME;
-                break;
-            default:
-                _path += raw_data[i];
-                break;
-            }
-        } break;
-
-        case QUERY_PARAM_NAME: {
-            if (raw_data[i] == ' ') {
-                scratch = "";
-                state = VERSION;
-            } else if (raw_data[i] == '=') {
-                state = QUERY_PARAM_VALUE;
-            } else {
-                scratch += raw_data[i];
-            }
-        } break;
-
-        case QUERY_PARAM_VALUE: {
-            if (raw_data[i] == ' ') {
-                _query_params[scratch] = query_param(scratch, scratch_2);
-                scratch = "";
-                scratch_2 = "";
-                state = VERSION;
-            } else if (raw_data[i] == '&') {
-                _query_params[scratch] = query_param(scratch, scratch_2);
-                scratch = "";
-                scratch_2 = "";
-                state = QUERY_PARAM_NAME;
-            } else {
-                scratch_2 += raw_data[i];
-            }
-        } break;
-
-        case VERSION: {
-            if (raw_data[i] == '\n') {
-                _http_version = version_map.find(scratch)->second;
-                scratch = "";
-                state = HEADER_NAME;
-            } else if (raw_data[i] != '\r') {
-                scratch += raw_data[i];
-            }
-        } break;
-
-        case HEADER_NAME: {
-            if (raw_data[i] == '\n') {
-                scratch = "";
-                scratch_2 = "";
-                state = BODY_CONTENT;
-                break;
-            } else if (raw_data[i] == ' ') {
-                scratch = "";
-                break;
-            } else if (raw_data[i] == ':') {
-                state = HEADER_VALUE;
-                i++;
-            } else {
-                scratch += raw_data[i];
-            }
-        } break;
-
-        case HEADER_VALUE: {
-            if (raw_data[i] == '\n') {
-                _headers[scratch] = header(scratch, scratch_2);
-                scratch = "";
-                scratch_2 = "";
-                state = HEADER_NAME;
-            } else if (raw_data[i] != '\r') {
-                scratch_2 += raw_data[i];
-            }
-        } break;
-
-        case BODY_CONTENT: {
-            _body_content += raw_data[i];
-        } break;
+            };
+            case BODY_CONTENT: break;
         }
     }
+
+    // whatever is left in the stream is the body (note: reading stops at a NUL byte)
+    if (getline(line_stream, line, '\0')) {
+        _body_content = line;
+    }
 }
 
 std::string request::path() { return _path; }
@@ -129,7 +147,6 @@ version request::get_http_version()
 
 bool request::is_supported_version()
 {
-    // log::err << reverse_version_map.find(_http_version)->second << std::endl;
     return _http_version == HTTP_1_1 || _http_version == HTTP_1_0;
 }
 
diff --git a/lib/http/request.hpp b/lib/http/request.hpp
index 7d6855b..036fc9f 100644
--- a/lib/http/request.hpp
+++ b/lib/http/request.hpp
@@ -9,14 +9,16 @@ namespace anthracite::http {
 
 class request {
 private:
-    enum parser_state { METHOD,
-        PATH,
-        QUERY_PARAM_NAME,
-        QUERY_PARAM_VALUE,
-        VERSION,
-        HEADER_NAME,
-        HEADER_VALUE,
-        BODY_CONTENT };
+    enum request_line_parser_state {
+        METHOD, PATH, VERSION
+    };
+
+    enum parser_state {
+        REQUEST_LINE,
+        HEADERS,
+        BODY_CONTENT
+    };
+
     method _method;
     version _http_version;
     std::string _path;
@@ -24,6 +26,11 @@ private:
     std::string _body_content;
     std::unordered_map<std::string, header> _headers; // kinda goofy, whatever
     std::unordered_map<std::string, query_param> _query_params; // kinda goofy, whatever
+    // parsing helpers, called (directly or indirectly) from the constructor
+    void parse_request_line(std::string& raw_line);
+    void parse_header(std::string& raw_line);
+    void parse_path(std::string& raw_path);
+    void parse_query_param(std::string& raw_param);
 
 public:
     request(std::string& raw_data, const std::string& client_ip);
diff --git a/tests/speed_tests.cpp b/tests/speed_tests.cpp
new file mode 100644
index 0000000..58994f3
--- /dev/null
+++ b/tests/speed_tests.cpp
@@ -0,0 +1,30 @@
+#include <gtest/gtest.h>
+#include <fstream>
+#include <chrono>
+#include "../lib/http/request.hpp"
+
+
+// Times one million constructions of http::request from the sample request file.
+TEST(speed_tests, request_parse) {
+    using std::chrono::high_resolution_clock;
+    using std::chrono::duration_cast;
+    using std::chrono::duration;
+    using std::chrono::milliseconds;
+
+    std::ifstream t("./test_files/test_request.http");
+    ASSERT_TRUE(t.is_open()) << "./test_files/test_request.http is missing";
+    std::stringstream buffer;
+    buffer << t.rdbuf();
+    std::string raw_req = buffer.str();
+
+    auto t1 = high_resolution_clock::now();
+    for(int i = 0; i < 1000000; ++i) {
+        volatile anthracite::http::request req (raw_req, "0.0.0.0");
+    }
+    auto t2 = high_resolution_clock::now();
+
+    /* Getting number of milliseconds as an integer. */
+    auto ms_int = duration_cast<milliseconds>(t2 - t1);
+
+    std::cout << "Parsed 1 Million requests in " << ms_int.count() << "ms" << std::endl;
+}
diff --git a/tests/test_files/test_request.http b/tests/test_files/test_request.http
new file mode 100644
index 0000000..4f79472
--- /dev/null
+++ b/tests/test_files/test_request.http
@@ -0,0 +1,13 @@
+GET /foo/bar?test=a&test2=b HTTP/1.1
+Host: example.org
+User-Agent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; fr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8
+Accept: */*
+Accept-Language: fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3
+Accept-Encoding: gzip,deflate
+Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
+Keep-Alive: 115
+Connection: keep-alive
+Content-Type: application/x-www-form-urlencoded
+X-Requested-With: XMLHttpRequest
+Referer: http://example.org/test
+Cookie: foo=bar; lorem=ipsum;