From cd3cc5476772d47f1228bcc9cd486a392f5af88e Mon Sep 17 00:00:00 2001 From: Shriram Ravindranathan Date: Sun, 20 Jul 2025 15:28:08 +0530 Subject: [PATCH] Ignore whitespace during HTML parsing The HTML parser now correctly ignores leading whitespace before the <html> and <head> tags, as well as whitespace within and after the <head> section. This improves parsing robustness for common HTML formatting variations. - Add a new test suite (WhitespaceTest.cpp) with specific unit tests to verify this behavior. - Update the CMake configuration to include the new test file in the build. --- src/Parser.cpp | 14 +++++++++++++- tests/CMakeLists.txt | 2 +- tests/WhitespaceTest.cpp | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 tests/WhitespaceTest.cpp diff --git a/src/Parser.cpp b/src/Parser.cpp index a995919..f13458a 100644 --- a/src/Parser.cpp +++ b/src/Parser.cpp @@ -119,6 +119,9 @@ namespace HtmlParser void Parser::InsertionModeBeforeHtml(const Token& Token) { + if (Token.Type == TokenType::Character && isspace(Token.Data[0])) { + return; // Ignore whitespace + } if (Token.Type == TokenType::StartTag && Utils::ToLower(Token.Data) == "html") { InsertElement(Token); @@ -138,6 +141,9 @@ namespace HtmlParser void Parser::InsertionModeBeforeHead(const Token& Token) { + if (Token.Type == TokenType::Character && isspace(Token.Data[0])) { + return; // Ignore whitespace + } if (Token.Type == TokenType::StartTag && Utils::ToLower(Token.Data) == "head") { InsertElement(Token); @@ -157,6 +163,9 @@ namespace HtmlParser void Parser::InsertionModeInHead(const Token& Token) { + if (Token.Type == TokenType::Character && isspace(Token.Data[0])) { + return; // Ignore whitespace + } if (Token.Type == TokenType::EndTag && Utils::ToLower(Token.Data) == "head") { OpenElements.pop_back(); @@ -173,6 +182,9 @@ namespace HtmlParser void Parser::InsertionModeAfterHead(const Token& Token) { + if (Token.Type == TokenType::Character && isspace(Token.Data[0])) { 
+ return; // Ignore whitespace + } if (Token.Type == TokenType::StartTag && Utils::ToLower(Token.Data) == "body") { InsertElement(Token); @@ -209,4 +221,4 @@ namespace HtmlParser // Handle other token types as needed } } -} // namespace HtmlParser \ No newline at end of file +} // namespace HtmlParser diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bae8b39..5dfb269 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(RunTests ParserTest.cpp DOMTest.cpp DOMStrictTest.cpp QueryTest.cpp DOMToHtmlTest.cpp QueryAdvancedTest.cpp) +add_executable(RunTests ParserTest.cpp DOMTest.cpp DOMStrictTest.cpp QueryTest.cpp DOMToHtmlTest.cpp QueryAdvancedTest.cpp WhitespaceTest.cpp) target_link_libraries(RunTests HtmlParser gtest gtest_main) add_test(NAME ParserTest COMMAND RunTests) diff --git a/tests/WhitespaceTest.cpp b/tests/WhitespaceTest.cpp new file mode 100644 index 0000000..60d7ffa --- /dev/null +++ b/tests/WhitespaceTest.cpp @@ -0,0 +1,36 @@ +#include +#include +#include + +TEST(ParserWhitespaceTest, WhitespaceBeforeHtml) +{ + HtmlParser::Parser Parser; + HtmlParser::DOM DOM = Parser.Parse(" \n\t "); + auto Body = DOM.GetElementsByTagName("body"); + ASSERT_EQ(Body.size(), 1); +} + +TEST(ParserWhitespaceTest, WhitespaceBeforeHead) +{ + HtmlParser::Parser Parser; + HtmlParser::DOM DOM = Parser.Parse(" \n\t "); + auto Head = DOM.GetElementsByTagName("head"); + ASSERT_EQ(Head.size(), 1); +} + +TEST(ParserWhitespaceTest, WhitespaceInHead) +{ + HtmlParser::Parser Parser; + HtmlParser::DOM DOM = Parser.Parse(" \n\t "); + auto Head = DOM.GetElementsByTagName("head"); + ASSERT_EQ(Head.size(), 1); +} + +TEST(ParserWhitespaceTest, WhitespaceAfterHead) +{ + HtmlParser::Parser Parser; + HtmlParser::DOM DOM = Parser.Parse(" \n\t "); + auto Body = DOM.GetElementsByTagName("body"); + ASSERT_EQ(Body.size(), 1); +} +