You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

315 lines
15 KiB

#include "HTMLParser.h"
#include "TextNode.h"
#include <algorithm>
#include <iostream>
#include <memory>
// ok=ul, not ok=li
void autoCloseTag(std::shared_ptr<Node> currentNode, std::shared_ptr<Node> rootNode, std::string ok, std::string notok) {
// are we inside an ul? li?
// recurse up to parent
std::shared_ptr<Node> it = currentNode;
//std::cout << "scanning li for unclosed lis" << std::endl;
while(it != rootNode) {
if (it->nodeType == NodeType::TAG) {
TagNode *pTagNode = dynamic_cast<TagNode*>(it.get());
if (pTagNode) {
// tag node
//std::cout << "scanning: " << pTagNode->tag << std::endl;
if (pTagNode->tag == ok) {
// we're ok, we found UL first
it = rootNode; // mark done
break;
}
if (pTagNode->tag == notok) {
// we're not ok, we found UL first
//std::cout << "need to close previous LI tag" << std::endl;
// set our parent to be our sibling LI's parent
// well currentNode is us
if (currentNode && pTagNode->parent) {
// we need to remove from children
for(std::vector<std::shared_ptr<Node>>::iterator it2 = currentNode->parent->children.begin(); it2!=currentNode->parent->children.end(); ++it2) {
if (it2->get() == currentNode.get()) {
//std::cout << "found us in children" << std::endl;
it2 = currentNode->parent->children.erase(it2);
break;
}
}
// move node under new parent
currentNode->parent = pTagNode->parent;
pTagNode->parent->children.push_back(currentNode);
} else {
std::cout << "HTMLParser::Parse - currentNode or pTagNode->parent - close previous " << notok << " tag" << std::endl;
}
it = rootNode; // mark done
break;
}
}
}
// recurse up
it = it->parent;
}
//std::cout << "scanned li for unclosed lis" << std::endl;
}
void printNode(const std::shared_ptr<Node> node, const int indent) {
for (int i = 0; i < indent; i++) {
std::cout << '\t';
}
if (node->nodeType == NodeType::ROOT) {
std::cout << "ROOT\n" << std::endl;
}
else if (node->nodeType == NodeType::TAG) {
std::cout << "TAG: " << dynamic_cast<TagNode*>(node.get())->tag << std::endl;
for (const std::pair<std::string, std::string> property : dynamic_cast<TagNode*>(node.get())->properties) {
for (int i = 0; i < indent; i++) {
std::cout << '\t';
}
std::cout << " " << property.first << ": " << property.second << std::endl;
}
}
else if (node->nodeType == NodeType::TEXT) {
std::cout << "TEXT: " << dynamic_cast<TextNode*>(node.get())->text << std::endl;
}
for (std::shared_ptr<Node> child : node->children) {
printNode(child, indent + 1);
}
}
std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
std::shared_ptr<Node> rootNode = std::make_shared<Node>(NodeType::ROOT);
std::shared_ptr<Node> currentNode = rootNode;
std::shared_ptr<Node> startTagLevel = rootNode;
std::vector<unsigned int> starts;
unsigned int cursor;
int state = 0;
int prependWhiteSpace = false;
for (cursor = 0; cursor < html.length(); cursor++) { // TODO handle trying to look ahead past string
if (state == 0) { // Outside tag
if (html[cursor] == ' ' || html[cursor] == '\t' || html[cursor] == '\r' || html[cursor] == '\n') {
prependWhiteSpace = true;
continue;
}
else if (html[cursor] == '<') {
// HTML comments
if (html[cursor + 1] == '!' && html[cursor + 2] == '-' && html[cursor + 3] == '-' ) {
//std::cout << "HTMLParser::Parse - starting HTML comment at " << cursor << std::endl;
state = 4;
}
// close tag
else if (html[cursor + 1] == '/') {
// start closing tag
//std::cout << "HTMLParser::Parse - starting closing tag at " << html.substr(cursor, 7) << std::endl;
if (currentNode && currentNode->parent) {
// we should snap to the level we started at (as we maybe a couple levels deep <ul><li></ul>
// but it's the matching part of this tag
currentNode = currentNode->parent;
} else {
std::cout << "HTMLParser::Parse - currentNode/parent is null - close tag" << std::endl;
}
state = 1; // ignore closing tags
}
// these have never have a closing tag
else if (
(html[cursor + 1] == 'h' && html[cursor + 2] == 'r') ||
(html[cursor + 1] == 'b' && html[cursor + 2] == 'r') ||
(html[cursor + 1] == 'w' && html[cursor + 2] == 'b' && html[cursor + 3] == 'r') ||
(html[cursor + 1] == 'i' && html[cursor + 2] == 'm' && html[cursor + 3] == 'g') ||
(html[cursor + 1] == 'l' && html[cursor + 2] == 'i' && html[cursor + 3] == 'n' && html[cursor + 4] == 'k') ||
(html[cursor + 1] == 'm' && html[cursor + 2] == 'e' && html[cursor + 3] == 't' && html[cursor + 4] == 'a') ||
(html[cursor + 1] == 'i' && html[cursor + 2] == 'n' && html[cursor + 3] == 'p' && html[cursor + 4] == 'u' && html[cursor + 5] == 't')
) {
//std::cout << "HTMLParser::Parse - Starting single tag " << html.substr(cursor, 6) << std::endl;
std::shared_ptr<TagNode> tagNode = std::make_shared<TagNode>();
if (currentNode) {
currentNode->children.push_back(tagNode);
tagNode->parent = currentNode;
} else {
std::cout << "HTMLParser::Parse - currentNode is null - tagNode" << std::endl;
}
currentNode = tagNode;
size_t closeTagPos = html.substr(cursor + 1).find(">");
//std::cout << "found closeTagPos at " << closeTagPos << std::endl;
if (closeTagPos == std::string::npos) {
std::cout << "HTMLParser::Parse - can't find closing tag for single tag" << std::endl;
cursor ++;
} else {
std::string element = html.substr(cursor, closeTagPos + 2);
//std::cout << "HTMLParser::Parse - creating element, tag: " << element << std::endl;
parseTag(element, *dynamic_cast<TagNode*>(currentNode.get()));
cursor += 2 + closeTagPos;
}
// drop back
if (currentNode && currentNode->parent) {
currentNode = currentNode->parent;
} else {
std::cout << "HTMLParser::Parse - currentNode/parent is null - textNode state3" << std::endl;
}
prependWhiteSpace = false;
state = 0;
}
// start tag (<bob> <bob part)
else {
//std::cout << "HTMLParser::Parse - start oc tag " << html.substr(cursor, 6) << std::endl;
std::shared_ptr<TagNode> tagNode = std::make_shared<TagNode>();
if (currentNode) {
currentNode->children.push_back(tagNode);
tagNode->parent = currentNode;
} else {
std::cout << "HTMLParser::Parse - currentNode is null - tagNode" << std::endl;
}
currentNode = tagNode;
starts.push_back(cursor);
state = 2;
}
}
else { // start text node
//std::cout << "HTMLParser::Parse - start text node " << html.substr(cursor, 6) << std::endl;
std::shared_ptr<TextNode> textNode = std::make_shared<TextNode>();
// not sure why currentNode is null but it is
if (currentNode) {
currentNode->children.push_back(textNode);
textNode->parent = currentNode;
} else {
std::cout << "HTMLParser::Parse - currentNode is null - textNode" << std::endl;
}
currentNode = textNode;
starts.push_back(cursor);
state = 3;
}
cursor--;
}
else if (state == 1) { // Skip Over Element (used by closing tag)
if (html[cursor] == '>') {
//std::cout << "HTMLParser::parse - close tag: " << html.substr(starts.back(), cursor - starts.back() + 1) << std::endl;
state = 0;
prependWhiteSpace = false;
}
}
else if (state == 4) { // HTML Comment
if (html[cursor] == '-' && html[cursor + 1] == '-' && html[cursor + 2] == '>') {
//std::cout << "HTMLParser::Parse - Found end HTML comment at " << html.substr(cursor, 6) << std::endl;
state = 0;
cursor += 2; // advance cursor to end of comment
prependWhiteSpace = false;
}
}
else if (state == 2) { // Search for end tag node
if (html[cursor] == '>') { // end tag node
std::string element = html.substr(starts.back(), cursor - starts.back() + 1);
//std::cout << "HTMLParser::parse - end open tag: " << element << std::endl;
if (element == "<li>") {
// this will close previous li before starting a new one
autoCloseTag(currentNode, rootNode, "ul", "li");
//std::cout << "scanned parents for unclosed lis" << std::endl;
} else
if (element == "<option>") { // FIXME: options have attributes
// this will close previous option before starting a new one
autoCloseTag(currentNode, rootNode, "select", "option");
//std::cout << "scanned parents for unclosed options" << std::endl;
}
starts.pop_back();
parseTag(element, *dynamic_cast<TagNode*>(currentNode.get()));
state = 0;
prependWhiteSpace = false;
}
}
else if (state == 3) { // End text node
if (html[cursor + 1] == '<') {
dynamic_cast<TextNode*>(currentNode.get())->text = (prependWhiteSpace?" ":"") + html.substr(starts.back(), cursor - starts.back() + 1);
//std::cout << "HTMLParser::parse - end text node: " << html.substr(starts.back(), cursor - starts.back() + 1) << std::endl;
starts.pop_back();
if (currentNode && currentNode->parent) {
currentNode = currentNode->parent;
} else {
std::cout << "HTMLParser::Parse - currentNode/parent is null - textNode state3" << std::endl;
}
state = 0;
prependWhiteSpace = false;
}
}
}
//printNode(rootNode, 0);
return rootNode;
}
// Fills `tagNode` from the text of one opening tag.
//
// element - the complete tag text, starting with '<' and normally ending with '>'
//           (for example: <a href="x" class='y' id=z>).
// tagNode - receives the lower-cased tag name in `tag` and one entry in
//           `properties` per key=value attribute. Values may be double-quoted,
//           single-quoted or unquoted. Attributes without '=' are ignored, and
//           for a repeated key the first value is kept.
//
// Whitespace means ' ', '\t', '\r' and '\n', the same set parse() skips between
// tokens. A '/' directly after the tag name ends the name, so "<br/>" yields "br".
void HTMLParser::parseTag(const std::string &element, TagNode &tagNode) const {
    // Numeric values match the state numbers printed in the diagnostic below.
    enum class State { TagName = 0, AttributeName = 1, ValueStart = 2, DoubleQuoted = 3, SingleQuoted = 4 };

    const auto isSpace = [](const char ch) {
        return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
    };

    std::string propertyKey;
    // Stores element[from, to) as the value of the attribute currently being read.
    const auto storeProperty = [&](const size_t from, const size_t to) {
        tagNode.properties.emplace(propertyKey, to > from ? element.substr(from, to - from) : std::string());
    };

    size_t start = 1; // skip the leading '<'
    State state = State::TagName;

    for (size_t cursor = start; cursor < element.length(); ++cursor) {
        const char ch = element[cursor];

        switch (state) {
        case State::TagName:
            if (isSpace(ch) || ch == '>' || (ch == '/' && cursor > start)) {
                // set our tag (type / name, i.e. h1)
                tagNode.tag = element.substr(start, cursor - start);
                // Tag names are lower-cased as ASCII; other bytes stay
                // untouched, which also avoids passing negative char values
                // to tolower().
                for (char &nameChar : tagNode.tag) {
                    if (nameChar >= 'A' && nameChar <= 'Z') {
                        nameChar = static_cast<char>(nameChar - 'A' + 'a');
                    }
                }
                start = cursor + 1;
                state = State::AttributeName;
            }
            break;

        case State::AttributeName:
            if (isSpace(ch)) {
                start = cursor + 1;
            } else if (ch == '=') {
                propertyKey = element.substr(start, cursor - start);
                start = cursor + 1; // start of an unquoted value
                state = State::ValueStart;
            }
            break;

        case State::ValueStart:
            if (ch == '"') {
                start = cursor + 1;
                state = State::DoubleQuoted;
            } else if (ch == '\'') {
                start = cursor + 1;
                state = State::SingleQuoted;
            } else if (isSpace(ch)) {
                // end of an attribute value written without quotes
                storeProperty(start, cursor);
                start = cursor + 1;
                state = State::AttributeName;
            }
            break;

        case State::DoubleQuoted:
            if (ch == '"') {
                storeProperty(start, cursor);
                start = cursor + 1;
                state = State::AttributeName;
            }
            break;

        case State::SingleQuoted:
            if (ch == '\'') {
                storeProperty(start, cursor);
                start = cursor + 1;
                state = State::AttributeName;
            }
            break;
        }
    }

    if (state == State::ValueStart || state == State::DoubleQuoted || state == State::SingleQuoted) {
        // The tag ended inside a value: either an unquoted value or an unclosed
        // quote. Keep everything up to, but not including, the closing '>'.
        const bool endsWithBracket = !element.empty() && element.back() == '>';
        const size_t valueEnd = endsWithBracket ? element.length() - 1 : element.length();
        storeProperty(start, valueEnd);
    } else if (state != State::AttributeName) {
        // Still reading the tag name: the element had no terminating whitespace or '>'.
        std::cout << "HTMLParser::parseTag ending on state " << static_cast<int>(state) << std::endl;
    }
}