|
|
|
@ -98,11 +98,13 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -98,11 +98,13 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
else if (html[cursor] == '<') { |
|
|
|
|
// HTML comments
|
|
|
|
|
if (html[cursor + 1] == '!' && html[cursor + 2] == '-' && html[cursor + 3] == '-' ) { |
|
|
|
|
//std::cout << "HTMLParser::Parse - starting HTML comment at " << cursor << std::endl;
|
|
|
|
|
state = 4; |
|
|
|
|
} |
|
|
|
|
// close tag
|
|
|
|
|
else if (html[cursor + 1] == '/') { |
|
|
|
|
// start closing tag
|
|
|
|
|
//std::cout << "HTMLParser::Parse - starting closing tag at " << html.substr(cursor, 7) << std::endl;
|
|
|
|
|
if (currentNode && currentNode->parent) { |
|
|
|
|
// we should snap to the level we started at (as we may be a couple of levels deep, e.g. <ul><li></ul>)
|
|
|
|
|
// but it's the matching part of this tag
|
|
|
|
@ -111,7 +113,6 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -111,7 +113,6 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
std::cout << "HTMLParser::Parse - currentNode/parent is null - close tag" << std::endl; |
|
|
|
|
} |
|
|
|
|
state = 1; // ignore closing tags
|
|
|
|
|
//starts.push_back(cursor);
|
|
|
|
|
} |
|
|
|
|
// these never have a closing tag
|
|
|
|
|
else if ( |
|
|
|
@ -123,6 +124,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -123,6 +124,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
(html[cursor + 1] == 'm' && html[cursor + 2] == 'e' && html[cursor + 3] == 't' && html[cursor + 4] == 'a') || |
|
|
|
|
(html[cursor + 1] == 'i' && html[cursor + 2] == 'n' && html[cursor + 3] == 'p' && html[cursor + 4] == 'u' && html[cursor + 5] == 't') |
|
|
|
|
) { |
|
|
|
|
//std::cout << "HTMLParser::Parse - Starting single tag " << html.substr(cursor, 6) << std::endl;
|
|
|
|
|
std::shared_ptr<TagNode> tagNode = std::make_shared<TagNode>(); |
|
|
|
|
if (currentNode) { |
|
|
|
|
currentNode->children.push_back(tagNode); |
|
|
|
@ -138,7 +140,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -138,7 +140,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
cursor ++; |
|
|
|
|
} else { |
|
|
|
|
std::string element = html.substr(cursor, closeTagPos + 2); |
|
|
|
|
//std::cout << "creating element, tag: " << element << std::endl;
|
|
|
|
|
//std::cout << "HTMLParser::Parse - creating element, tag: " << element << std::endl;
|
|
|
|
|
parseTag(element, *dynamic_cast<TagNode*>(currentNode.get())); |
|
|
|
|
cursor += 2 + closeTagPos; |
|
|
|
|
} |
|
|
|
@ -155,6 +157,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -155,6 +157,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
} |
|
|
|
|
// start tag (<bob> <bob part)
|
|
|
|
|
else { |
|
|
|
|
//std::cout << "HTMLParser::Parse - start oc tag " << html.substr(cursor, 6) << std::endl;
|
|
|
|
|
std::shared_ptr<TagNode> tagNode = std::make_shared<TagNode>(); |
|
|
|
|
if (currentNode) { |
|
|
|
|
currentNode->children.push_back(tagNode); |
|
|
|
@ -168,6 +171,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -168,6 +171,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else { // start text node
|
|
|
|
|
//std::cout << "HTMLParser::Parse - start text node " << html.substr(cursor, 6) << std::endl;
|
|
|
|
|
std::shared_ptr<TextNode> textNode = std::make_shared<TextNode>(); |
|
|
|
|
// not sure why currentNode is null but it is
|
|
|
|
|
if (currentNode) { |
|
|
|
@ -182,17 +186,16 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -182,17 +186,16 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
} |
|
|
|
|
cursor--; |
|
|
|
|
} |
|
|
|
|
else if (state == 1) { // Skip Over Element
|
|
|
|
|
else if (state == 1) { // Skip Over Element (used by closing tag)
|
|
|
|
|
if (html[cursor] == '>') { |
|
|
|
|
//std::string element = html.substr(starts.back(), cursor - starts.back() + 1);
|
|
|
|
|
//starts.pop_back();
|
|
|
|
|
//std::cout << "HTMLParser::parse - close tag: " << element << std::endl;
|
|
|
|
|
//std::cout << "HTMLParser::parse - close tag: " << html.substr(starts.back(), cursor - starts.back() + 1) << std::endl;
|
|
|
|
|
state = 0; |
|
|
|
|
prependWhiteSpace = false; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else if (state == 4) { // HTML Comment
|
|
|
|
|
if (html[cursor] == '-' && html[cursor + 1] == '-' && html[cursor + 2] == '>') { |
|
|
|
|
//std::cout << "HTMLParser::Parse - Found end HTML comment at " << html.substr(cursor, 6) << std::endl;
|
|
|
|
|
state = 0; |
|
|
|
|
cursor += 2; // advance cursor to end of comment
|
|
|
|
|
prependWhiteSpace = false; |
|
|
|
@ -201,7 +204,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -201,7 +204,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
else if (state == 2) { // Search for end tag node
|
|
|
|
|
if (html[cursor] == '>') { // end tag node
|
|
|
|
|
std::string element = html.substr(starts.back(), cursor - starts.back() + 1); |
|
|
|
|
//std::cout << "HTMLParser::parse - close tag: " << element << std::endl;
|
|
|
|
|
//std::cout << "HTMLParser::parse - end open tag: " << element << std::endl;
|
|
|
|
|
if (element == "<li>") { |
|
|
|
|
// this will close previous li before starting a new one
|
|
|
|
|
autoCloseTag(currentNode, rootNode, "ul", "li"); |
|
|
|
@ -221,6 +224,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
@@ -221,6 +224,7 @@ std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
|
|
|
|
|
else if (state == 3) { // End text node
|
|
|
|
|
if (html[cursor + 1] == '<') { |
|
|
|
|
dynamic_cast<TextNode*>(currentNode.get())->text = (prependWhiteSpace?" ":"") + html.substr(starts.back(), cursor - starts.back() + 1); |
|
|
|
|
//std::cout << "HTMLParser::parse - end text node: " << html.substr(starts.back(), cursor - starts.back() + 1) << std::endl;
|
|
|
|
|
starts.pop_back(); |
|
|
|
|
if (currentNode && currentNode->parent) { |
|
|
|
|
currentNode = currentNode->parent; |
|
|
|
@ -273,6 +277,8 @@ void HTMLParser::parseTag(const std::string &element, TagNode &tagNode) const {
@@ -273,6 +277,8 @@ void HTMLParser::parseTag(const std::string &element, TagNode &tagNode) const {
|
|
|
|
|
} |
|
|
|
|
else if (state == 3) { |
|
|
|
|
if (element[cursor] == '"') { |
|
|
|
|
// was suggested to use tagNode.properties[propertyKey] = element.substr(start, cursor - start);
|
|
|
|
|
// for better readability
|
|
|
|
|
tagNode.properties.insert(std::pair<std::string, std::string>(propertyKey, element.substr(start, cursor - start))); |
|
|
|
|
start = cursor + 1; |
|
|
|
|
state = 1; |
|
|
|
|