& tokens);
-
/**
 * @brief A node of the parsed HTML document tree.
 *
 * Each node stores a tag name, the text collected directly inside it, its
 * attributes and its child elements. Children are owned by the `children`
 * vector through `std::unique_ptr`; `parent` is a non-owning back pointer that
 * stays null unless the node was created through createElement().
 *
 * Note: moving a Node does not update the `parent` pointers of its children,
 * so nodes are best kept behind the `std::unique_ptr` that owns them.
 */
class Node {
public:
    std::string tagName;                             // Tag name of the element (e.g. "div", "h1")
    std::string innerText;                           // Text content directly inside this node
    std::map<std::string, std::string> attributes;   // Attribute name -> attribute value
    std::vector<std::unique_ptr<Node>> children;     // Owned child nodes, in creation order
    Node* parent;                                    // Non-owning pointer to the parent node

    /**
     * @brief Creates a detached node with no parent.
     */
    Node() : parent(nullptr) {}

    // --- Setters ---
    void setTagName(const std::string& name) { tagName = name; }
    void setInnerText(const std::string& text) { innerText = text; }
    void setAttribute(const std::string& key, const std::string& value) { attributes[key] = value; }

    // --- Getters ---
    const std::string& getTagName() const { return tagName; }
    const std::string& getInnerText() const { return innerText; }
    const std::map<std::string, std::string>& getAttributes() const { return attributes; }

    /**
     * @brief Creates a new child element and appends it to `children`.
     * @param name The tag name of the new element (e.g. "p", "a").
     * @return A non-owning pointer to the new child; the child is owned by
     *         this node and its `parent` points back to this node.
     */
    Node* createElement(const std::string& name) {
        auto child = std::make_unique<Node>();
        child->setTagName(name);
        child->parent = this;
        Node* rawPtr = child.get();  // Grab the address before ownership moves into the vector
        children.push_back(std::move(child));
        return rawPtr;
    }

    /**
     * @brief Writes the subtree rooted at this node to std::cout for debugging.
     *
     * The opening tag, its attributes and the inner text go on one line, each
     * child follows on its own line(s) one level deeper, and the closing tag
     * gets a line of its own. Every element is printed with a closing tag,
     * including empty ones.
     *
     * @param indent The current indentation level (one space per level).
     */
    void print(int indent = 0) const {
        writeIndent(indent);
        std::cout << "<" << tagName;
        for (const auto& attr : attributes) {
            std::cout << " " << attr.first << "=\"" << attr.second << "\"";
        }
        std::cout << ">" << innerText << '\n';

        for (const auto& child : children) {
            child->print(indent + 1);
        }

        writeIndent(indent);
        std::cout << "</" << tagName << ">" << '\n';
    }

private:
    /** @brief Writes one space per indentation level; nothing for levels <= 0. */
    static void writeIndent(int indent) {
        for (int i = 0; i < indent; ++i) std::cout << " ";
    }
};
-
/**
 * @brief Builds an attribute map from the tokens of one tag.
 *
 * The first token is the tag name and is skipped. The remaining tokens may
 * describe an attribute in any of these shapes:
 *   - `key=value`            one token; matching surrounding quotes are removed
 *   - `key`, `=value`        two tokens; matching surrounding quotes are removed
 *   - `key`, `=`, `value`    three tokens; the value is taken verbatim
 *   - `key`                  attribute without a value, stored with ""
 *
 * A value that arrives without a preceding bare key is stored under the empty
 * key.
 *
 * @param tokens Tag name followed by attribute tokens.
 * @return A map of attribute names to their values.
 */
std::map<std::string, std::string> parseAttributes(const std::vector<std::string>& tokens) {
    std::map<std::string, std::string> attrs;

    // Removes one pair of matching quotes around a value, if present.
    const auto stripQuotes = [](const std::string& value) {
        const bool quoted = value.length() >= 2 &&
                            (value.front() == '"' || value.front() == '\'') &&
                            value.back() == value.front();
        return quoted ? value.substr(1, value.length() - 2) : value;
    };

    // Most recent bare key; a following "=value" or "=" token assigns to it.
    std::string pendingKey;

    // Skip the first token, which is the tag name itself.
    for (size_t i = 1; i < tokens.size(); ++i) {
        const std::string& token = tokens[i];
        const size_t eqPos = token.find('=');

        if (eqPos == std::string::npos) {
            // Bare key: valueless for now, but a value token may still follow.
            attrs[token] = "";
            pendingKey = token;
        } else if (eqPos > 0) {
            // Complete `key=value` token.
            attrs[token.substr(0, eqPos)] = stripQuotes(token.substr(eqPos + 1));
            pendingKey.clear();
        } else if (token.length() > 1) {
            // `=value` token: the key was delivered as the previous token.
            attrs[pendingKey] = stripQuotes(token.substr(1));
            pendingKey.clear();
        } else {
            // Lone `=`: the value, if any, is the next token. It is taken
            // verbatim so a value that itself contains '=' is not split.
            std::string value;
            if (i + 1 < tokens.size()) {
                value = tokens[++i];
            }
            attrs[pendingKey] = value;
            pendingKey.clear();
        }
    }
    return attrs;
}
-
-/**
- * @brief Core HTML parsing logic that reads character by character from an input stream.
- * @param inputStream The input stream (e.g., `ifstream` for file, `stringstream` for string).
- * @return A unique_ptr to the root Node of the parsed HTML tree.
- */
-std::unique_ptr parseHtmlStream(std::istream& inputStream) {
- std::unique_ptr root = std::make_unique();
- root->setTagName("root"); // Create a conceptual root node for the document
-
- Node* currentNode = root.get(); // Pointer to the currently active node in the tree
-
- std::string currentTokenPart = ""; // Buffer for collecting tag names, attribute keys, or partial attribute values
- std::string textBuffer = ""; // Buffer for collecting inner text content
-
- bool inTag = false; // True when parser is inside '<' and '>' of a tag
- bool inClosingTag = false; // True when parsing a closing tag (e.g., after ``)
- bool inAttributeValue = false; // True when parsing a quoted attribute value (e.g., `value="...")
- char attributeQuoteType = ' '; // Stores the type of quote (`'` or `"`) used for the attribute value
-
- std::vector tagTokens; // Stores the tag name and all its attributes as individual tokens
-
- char current; // Current character being processed
-
- // Loop through the input stream character by character
- while (inputStream.get(current)) {
- if (inTag) {
- // --- Parser is currently inside an HTML tag (`<...>` or ``) ---
- if (inAttributeValue) {
- // Currently parsing content within an attribute's quoted value
- if (current == attributeQuoteType) {
- // End of the attribute value (closing quote found)
- inAttributeValue = false;
- tagTokens.push_back(currentTokenPart); // Add the complete attribute value to tokens
- currentTokenPart = ""; // Reset buffer for next part
- } else {
- currentTokenPart += current; // Continue accumulating the attribute value
- }
- } else if (current == ' ' || current == '\t' || current == '\n' || current == '\r') {
- // Whitespace: separates tag name, attributes, or self-closing markers
- if (!currentTokenPart.empty()) {
- tagTokens.push_back(currentTokenPart); // Add the accumulated part (tag name or attribute key/value)
- currentTokenPart = ""; // Reset buffer
- }
- } else if (current == '=' && !inClosingTag) {
- // Encountered an equals sign (for `key=value` attributes)
- if (!currentTokenPart.empty()) {
- tagTokens.push_back(currentTokenPart); // Push the attribute key
- currentTokenPart = "";
- }
- currentTokenPart += current; // Add the '=' to the current part (it will be combined with value later)
- } else if (current == '"' || current == '\'') {
- // Start of an attribute value (quoted)
- // If there's an accumulated part, it should be the key or `key=`
- if (!currentTokenPart.empty()) {
- tagTokens.push_back(currentTokenPart);
- }
- currentTokenPart = ""; // Clear for the actual value
- inAttributeValue = true;
- attributeQuoteType = current; // Store the quote type
- } else if (current == '/') {
- // Encountered a slash (`/`). Could be for:
- // 1. Self-closing tag (e.g., `
`, ``)
- // 2. Start of a closing tag (e.g., `
') {
- // --- End of a tag (`>` found) ---
- inTag = false; // Exiting the tag processing state
-
- // If there's any remaining accumulated part, add it to tokens
- if (!currentTokenPart.empty()) {
- tagTokens.push_back(currentTokenPart);
- currentTokenPart = "";
- }
-
- if (inClosingTag) {
- // This was a closing tag (e.g., ``)
- if (!tagTokens.empty()) {
- std::string closingTagName = tagTokens[0];
- // Trim trailing '/' if it was parsed as part of the tag name (e.g. `br/` due to tokenization)
- if (!closingTagName.empty() && closingTagName.back() == '/') {
- closingTagName.pop_back();
- }
-
- // Check if the closing tag matches the current node's tag name
- if (currentNode->getTagName() == closingTagName) {
- if (currentNode->parent) {
- currentNode = currentNode->parent; // Move up to the parent node
- } else {
- // Error: Tried to close the root node or unmatched closing tag without a parent
- std::cerr << "Warning: Attempted to close root tag or unmatched closing tag: " << closingTagName << ">" << std::endl;
- }
- } else {
- // Mismatched closing tag (e.g., `
`)
- std::cerr << "Error: Mismatched closing tag. Expected " << currentNode->getTagName()
- << ">, got " << closingTagName << ">" << std::endl;
- // For robustness, you might choose to still move up or try to find a matching ancestor.
- // For this simple parser, we'll still attempt to move up, assuming it's a parse error.
- if (currentNode->parent) {
- currentNode = currentNode->parent;
- }
- }
- } else {
- std::cerr << "Warning: Empty closing tag detected (e.g., >)!" << std::endl;
- }
- inClosingTag = false; // Reset the closing tag state
- } else {
- // This was an opening tag (e.g., ``, ``)
- // Or a self-closing tag (e.g., `
`, `
![]()
`)
- if (!tagTokens.empty()) {
- std::string tagName = tagTokens[0]; // The first token is the tag name
- bool isSelfClosing = false;
-
- // Check if the tag is self-closing
- // 1. If the tag name itself ends with '/' (e.g., `
` where `br/` is the token)
- if (!tagName.empty() && tagName.back() == '/') {
- isSelfClosing = true;
- tagName.pop_back(); // Remove the '/' from the tag name
- }
- // 2. If the last token is just "/" (e.g., `
` where `/` is a separate token)
- else if (!tagTokens.empty() && tagTokens.back() == "/") {
- isSelfClosing = true;
- tagTokens.pop_back(); // Remove the "/" token from the list
- }
-
- // Create the new node as a child of the current node
- Node* newNode = currentNode->createElement(tagName);
- // Parse attributes from the collected tokens
- auto attrs = parseAttributes(tagTokens);
- for (auto const& pair : attrs) {
- newNode->setAttribute(pair.first, pair.second);
- }
- currentNode = newNode; // Move `currentNode` down to the newly created element
-
- if (isSelfClosing) {
- // For self-closing tags, immediately move back up to the parent
- // as they do not contain children or inner text.
- if (currentNode->parent) {
- currentNode = currentNode->parent;
- }
- }
- } else {
- std::cerr << "Warning: Empty or malformed opening tag detected: < >" << std::endl;
- }
- }
- tagTokens.clear(); // Clear tokens for the next tag
- textBuffer = ""; // Clear any accumulated text before a new tag or content starts
- } else {
- // Accumulate characters for the tag name or an attribute key/value part
- currentTokenPart += current;
- }
- } else {
- // --- Parser is currently outside a tag, processing text content or looking for new tags ---
- if (current == '<') {
- // A new tag is starting.
- // If there's accumulated text, assign it as innerText to the current node
- if (!textBuffer.empty()) {
- // Trim leading/trailing whitespace/newlines from the text content
- size_t first = textBuffer.find_first_not_of(" \t\n\r");
- size_t last = textBuffer.find_last_not_of(" \t\n\r");
- if (std::string::npos != first) {
- currentNode->setInnerText(textBuffer.substr(first, (last - first + 1)));
- }
- textBuffer = ""; // Reset text buffer for next content
- }
-
- // Peek at the next character to determine the type of tag
- char nextChar;
- if (inputStream.get(nextChar)) { // Attempt to read the next character without consuming it fully yet
- if (nextChar == '/') {
- // It's a closing tag (e.g., `
`) or DOCTYPE (``)
- // Simple skipping logic: read until the next `>`
- while (inputStream.get(current) && current != '>');
- continue; // Continue to the next character in the main loop
- } else {
- // It's a regular opening tag (e.g., ``)
- inTag = true; // Enter the tag parsing state
- currentTokenPart += nextChar; // Add the actual tag name character
- }
- } else {
- // Unexpected end of file after '<'
- std::cerr << "Warning: Unexpected EOF after '<'" << std::endl;
- break; // Exit parsing loop
- }
- } else {
- // Accumulate characters as inner text content for the current node
- textBuffer += current;
- }
- }
- }
-
- // After the loop, if there's any remaining text in the buffer, assign it
- if (!textBuffer.empty()) {
- size_t first = textBuffer.find_first_not_of(" \t\n\r");
- size_t last = textBuffer.find_last_not_of(" \t\n\r");
- if (std::string::npos != first) {
- currentNode->setInnerText(textBuffer.substr(first, (last - first + 1)));
- }
- }
-
- return root; // Return the root of the parsed HTML tree
-}
-
-/**
- * @brief Parses an HTML file and constructs a DOM tree.
- * @param file The path to the HTML file.
- * @return A unique_ptr to the root Node of the parsed HTML tree, or nullptr if the file cannot be opened.
- */
-std::unique_ptr
parserdocument(std::string file) {
- std::ifstream inputFile(file); // Open the specified file
-
- if (!inputFile.is_open()) {
- std::cerr << "Unable to open: " << file << std::endl;
- return nullptr; // Return nullptr if file opening fails
- }
-
- return parseHtmlStream(inputFile); // Use the core parsing logic
-}
-
-
-// --- Main function for demonstration and testing ---
-int main() {
- // Test Case 1: Empty tag
- std::cout << "--- Test Case 1: Empty tag ---" << std::endl;
- std::string html1 = "";
- std::stringstream ss1(html1);
- std::unique_ptr root1 = parseHtmlStream(ss1);
- if (root1) {
- root1->print();
- }
- std::cout << "\n";
-
- // Test Case 2: Tag with text content Hello World
- std::cout << "--- Test Case 2: Tag with text content Hello World
---" << std::endl;
- std::string html2 = "Hello World
";
- std::stringstream ss2(html2);
- std::unique_ptr root2 = parseHtmlStream(ss2);
- if (root2) {
- root2->print();
- }
- std::cout << "\n";
-
- // Test Case 3: Nested tags with mixed content
- std::cout << "--- Test Case 3: Nested tags with mixed content ---" << std::endl;
- std::string html3 = "This is bold text.
";
- std::stringstream ss3(html3);
- std::unique_ptr root3 = parseHtmlStream(ss3);
- if (root3) {
- root3->print();
- }
- std::cout << "\n";
-
- // Test Case 4: Tags with attributes and self-closing tags
- std::cout << "--- Test Case 4: Tags with attributes and self-closing tags ---" << std::endl;
- std::string html4 = "
";
- std::stringstream ss4(html4);
- std::unique_ptr root4 = parseHtmlStream(ss4);
- if (root4) {
- root4->print();
- }
- std::cout << "\n";
-
- // Test Case 5: Empty tag with whitespace between
- std::cout << "--- Test Case 5: Empty tag with whitespace between
---" << std::endl;
- std::string html5 = "
";
- std::stringstream ss5(html5);
- std::unique_ptr root5 = parseHtmlStream(ss5);
- if (root5) {
- root5->print();
- }
- std::cout << "\n";
-
- // Test Case 6: Malformed closing tag
- std::cout << "--- Test Case 6: Malformed closing tag (mismatch) ---" << std::endl;
- std::string html6 = "
"; // Mismatched closing tag
- std::stringstream ss6(html6);
- std::unique_ptr root6 = parseHtmlStream(ss6);
- if (root6) {
- root6->print();
- }
- std::cout << "\n";
-
- // Test Case 7: Comment and DOCTYPE
- std::cout << "--- Test Case 7: Comment and DOCTYPE ---" << std::endl;
- std::string html7 = "\nContent";
- std::stringstream ss7(html7);
- std::unique_ptr root7 = parseHtmlStream(ss7);
- if (root7) {
- root7->print();
- }
- std::cout << "\n";
-
- // Example of using parserdocument function with a file (requires a test.html file)
- // std::cout << "--- Test Case 8: Parsing from file (test.html) ---" << std::endl;
- // // Create a dummy test.html file for this test
- // std::ofstream outfile("test.html");
- // outfile << "Hello from file!
";
- // outfile.close();
- //
- // std::unique_ptr root8 = parserdocument("test.html");
- // if (root8) {
- // root8->print();
- // } else {
- // std::cout << "Failed to parse test.html" << std::endl;
- // }
-
- return 0;
-}