You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

HTMLParser.cpp 14KB


  1. #include "HTMLParser.h"
  2. #include "TextNode.h"
  3. #include <algorithm>
  4. #include <iostream>
  5. #include <memory>
  6. // ok=ul, not ok=li
  7. void autoCloseTag(std::shared_ptr<Node> currentNode, std::shared_ptr<Node> rootNode, std::string ok, std::string notok) {
  8. // are we inside an ul? li?
  9. // recurse up to parent
  10. std::shared_ptr<Node> it = currentNode;
  11. //std::cout << "scanning li for unclosed lis" << std::endl;
  12. while(it != rootNode) {
  13. if (it->nodeType == NodeType::TAG) {
  14. TagNode *pTagNode = dynamic_cast<TagNode*>(it.get());
  15. if (pTagNode) {
  16. // tag node
  17. //std::cout << "scanning: " << pTagNode->tag << std::endl;
  18. if (pTagNode->tag == ok) {
  19. // we're ok, we found UL first
  20. it = rootNode; // mark done
  21. break;
  22. }
  23. if (pTagNode->tag == notok) {
  24. // we're not ok, we found UL first
  25. //std::cout << "need to close previous LI tag" << std::endl;
  26. // set our parent to be our sibling LI's parent
  27. // well currentNode is us
  28. if (currentNode && pTagNode->parent) {
  29. // we need to remove from children
  30. for(std::vector<std::shared_ptr<Node>>::iterator it2 = currentNode->parent->children.begin(); it2!=currentNode->parent->children.end(); ++it2) {
  31. if (it2->get() == currentNode.get()) {
  32. //std::cout << "found us in children" << std::endl;
  33. it2 = currentNode->parent->children.erase(it2);
  34. break;
  35. }
  36. }
  37. // move node under new parent
  38. currentNode->parent = pTagNode->parent;
  39. pTagNode->parent->children.push_back(currentNode);
  40. } else {
  41. std::cout << "HTMLParser::Parse - currentNode or pTagNode->parent - close previous " << notok << " tag" << std::endl;
  42. }
  43. it = rootNode; // mark done
  44. break;
  45. }
  46. }
  47. }
  48. // recurse up
  49. it = it->parent;
  50. }
  51. //std::cout << "scanned li for unclosed lis" << std::endl;
  52. }
  53. void printNode(const std::shared_ptr<Node> node, const int indent) {
  54. for (int i = 0; i < indent; i++) {
  55. std::cout << '\t';
  56. }
  57. if (node->nodeType == NodeType::ROOT) {
  58. std::cout << "ROOT\n" << std::endl;
  59. }
  60. else if (node->nodeType == NodeType::TAG) {
  61. std::cout << "TAG: " << dynamic_cast<TagNode*>(node.get())->tag << std::endl;
  62. for (const std::pair<std::string, std::string> property : dynamic_cast<TagNode*>(node.get())->properties) {
  63. for (int i = 0; i < indent; i++) {
  64. std::cout << '\t';
  65. }
  66. std::cout << " " << property.first << ": " << property.second << std::endl;
  67. }
  68. }
  69. else if (node->nodeType == NodeType::TEXT) {
  70. std::cout << "TEXT: " << dynamic_cast<TextNode*>(node.get())->text << std::endl;
  71. }
  72. for (std::shared_ptr<Node> child : node->children) {
  73. printNode(child, indent + 1);
  74. }
  75. }
  76. std::shared_ptr<Node> HTMLParser::parse(const std::string &html) const {
  77. std::shared_ptr<Node> rootNode = std::make_shared<Node>(NodeType::ROOT);
  78. std::shared_ptr<Node> currentNode = rootNode;
  79. std::shared_ptr<Node> startTagLevel = rootNode;
  80. std::vector<unsigned int> starts;
  81. unsigned int cursor;
  82. int state = 0;
  83. int prependWhiteSpace = false;
  84. for (cursor = 0; cursor < html.length(); cursor++) { // TODO handle trying to look ahead past string
  85. if (state == 0) { // Outside tag
  86. if (html[cursor] == ' ' || html[cursor] == '\t' || html[cursor] == '\r' || html[cursor] == '\n') {
  87. prependWhiteSpace = true;
  88. continue;
  89. }
  90. else if (html[cursor] == '<') {
  91. // HTML comments
  92. if (html[cursor + 1] == '!' && html[cursor + 2] == '-' && html[cursor + 3] == '-' ) {
  93. //std::cout << "HTMLParser::Parse - starting HTML comment at " << cursor << std::endl;
  94. state = 4;
  95. }
  96. // close tag
  97. else if (html[cursor + 1] == '/') {
  98. // start closing tag
  99. //std::cout << "HTMLParser::Parse - starting closing tag at " << html.substr(cursor, 7) << std::endl;
  100. if (currentNode && currentNode->parent) {
  101. // we should snap to the level we started at (as we maybe a couple levels deep <ul><li></ul>
  102. // but it's the matching part of this tag
  103. currentNode = currentNode->parent;
  104. } else {
  105. std::cout << "HTMLParser::Parse - currentNode/parent is null - close tag" << std::endl;
  106. }
  107. state = 1; // ignore closing tags
  108. }
  109. // these have never have a closing tag
  110. else if (
  111. (html[cursor + 1] == 'h' && html[cursor + 2] == 'r') ||
  112. (html[cursor + 1] == 'b' && html[cursor + 2] == 'r') ||
  113. (html[cursor + 1] == 'w' && html[cursor + 2] == 'b' && html[cursor + 3] == 'r') ||
  114. (html[cursor + 1] == 'i' && html[cursor + 2] == 'm' && html[cursor + 3] == 'g') ||
  115. (html[cursor + 1] == 'l' && html[cursor + 2] == 'i' && html[cursor + 3] == 'n' && html[cursor + 4] == 'k') ||
  116. (html[cursor + 1] == 'm' && html[cursor + 2] == 'e' && html[cursor + 3] == 't' && html[cursor + 4] == 'a') ||
  117. (html[cursor + 1] == 'i' && html[cursor + 2] == 'n' && html[cursor + 3] == 'p' && html[cursor + 4] == 'u' && html[cursor + 5] == 't')
  118. ) {
  119. //std::cout << "HTMLParser::Parse - Starting single tag " << html.substr(cursor, 6) << std::endl;
  120. std::shared_ptr<TagNode> tagNode = std::make_shared<TagNode>();
  121. if (currentNode) {
  122. currentNode->children.push_back(tagNode);
  123. tagNode->parent = currentNode;
  124. } else {
  125. std::cout << "HTMLParser::Parse - currentNode is null - tagNode" << std::endl;
  126. }
  127. currentNode = tagNode;
  128. size_t closeTagPos = html.substr(cursor + 1).find(">");
  129. //std::cout << "found closeTagPos at " << closeTagPos << std::endl;
  130. if (closeTagPos == std::string::npos) {
  131. std::cout << "HTMLParser::Parse - can't find closing tag for single tag" << std::endl;
  132. cursor ++;
  133. } else {
  134. std::string element = html.substr(cursor, closeTagPos + 2);
  135. //std::cout << "HTMLParser::Parse - creating element, tag: " << element << std::endl;
  136. parseTag(element, *dynamic_cast<TagNode*>(currentNode.get()));
  137. cursor += 2 + closeTagPos;
  138. }
  139. // drop back
  140. if (currentNode && currentNode->parent) {
  141. currentNode = currentNode->parent;
  142. } else {
  143. std::cout << "HTMLParser::Parse - currentNode/parent is null - textNode state3" << std::endl;
  144. }
  145. prependWhiteSpace = false;
  146. state = 0;
  147. }
  148. // start tag (<bob> <bob part)
  149. else {
  150. //std::cout << "HTMLParser::Parse - start oc tag " << html.substr(cursor, 6) << std::endl;
  151. std::shared_ptr<TagNode> tagNode = std::make_shared<TagNode>();
  152. if (currentNode) {
  153. currentNode->children.push_back(tagNode);
  154. tagNode->parent = currentNode;
  155. } else {
  156. std::cout << "HTMLParser::Parse - currentNode is null - tagNode" << std::endl;
  157. }
  158. currentNode = tagNode;
  159. starts.push_back(cursor);
  160. state = 2;
  161. }
  162. }
  163. else { // start text node
  164. //std::cout << "HTMLParser::Parse - start text node " << html.substr(cursor, 6) << std::endl;
  165. std::shared_ptr<TextNode> textNode = std::make_shared<TextNode>();
  166. // not sure why currentNode is null but it is
  167. if (currentNode) {
  168. currentNode->children.push_back(textNode);
  169. textNode->parent = currentNode;
  170. } else {
  171. std::cout << "HTMLParser::Parse - currentNode is null - textNode" << std::endl;
  172. }
  173. currentNode = textNode;
  174. starts.push_back(cursor);
  175. state = 3;
  176. }
  177. cursor--;
  178. }
  179. else if (state == 1) { // Skip Over Element (used by closing tag)
  180. if (html[cursor] == '>') {
  181. //std::cout << "HTMLParser::parse - close tag: " << html.substr(starts.back(), cursor - starts.back() + 1) << std::endl;
  182. state = 0;
  183. prependWhiteSpace = false;
  184. }
  185. }
  186. else if (state == 4) { // HTML Comment
  187. if (html[cursor] == '-' && html[cursor + 1] == '-' && html[cursor + 2] == '>') {
  188. //std::cout << "HTMLParser::Parse - Found end HTML comment at " << html.substr(cursor, 6) << std::endl;
  189. state = 0;
  190. cursor += 2; // advance cursor to end of comment
  191. prependWhiteSpace = false;
  192. }
  193. }
  194. else if (state == 2) { // Search for end tag node
  195. if (html[cursor] == '>') { // end tag node
  196. std::string element = html.substr(starts.back(), cursor - starts.back() + 1);
  197. //std::cout << "HTMLParser::parse - end open tag: " << element << std::endl;
  198. if (element == "<li>") {
  199. // this will close previous li before starting a new one
  200. autoCloseTag(currentNode, rootNode, "ul", "li");
  201. //std::cout << "scanned parents for unclosed lis" << std::endl;
  202. } else
  203. if (element == "<option>") { // FIXME: options have attributes
  204. // this will close previous option before starting a new one
  205. autoCloseTag(currentNode, rootNode, "select", "option");
  206. //std::cout << "scanned parents for unclosed options" << std::endl;
  207. }
  208. starts.pop_back();
  209. parseTag(element, *dynamic_cast<TagNode*>(currentNode.get()));
  210. state = 0;
  211. prependWhiteSpace = false;
  212. }
  213. }
  214. else if (state == 3) { // End text node
  215. if (html[cursor + 1] == '<') {
  216. dynamic_cast<TextNode*>(currentNode.get())->text = (prependWhiteSpace?" ":"") + html.substr(starts.back(), cursor - starts.back() + 1);
  217. //std::cout << "HTMLParser::parse - end text node: " << html.substr(starts.back(), cursor - starts.back() + 1) << std::endl;
  218. starts.pop_back();
  219. if (currentNode && currentNode->parent) {
  220. currentNode = currentNode->parent;
  221. } else {
  222. std::cout << "HTMLParser::Parse - currentNode/parent is null - textNode state3" << std::endl;
  223. }
  224. state = 0;
  225. prependWhiteSpace = false;
  226. }
  227. }
  228. }
  229. //printNode(rootNode, 0);
  230. return rootNode;
  231. }
  232. void HTMLParser::parseTag(const std::string &element, TagNode &tagNode) const {
  233. //std::cout << "HTMLParser::parseTag - element [" << element << "]" << std::endl;
  234. unsigned int cursor;
  235. unsigned int start = 1; // skip first <
  236. int state = 0;
  237. std::string propertyKey;
  238. for (cursor = 0; cursor < element.length(); cursor++) {
  239. if (state == 0) {
  240. if (element[cursor] == ' ' || element[cursor] == '>') {
  241. tagNode.tag = element.substr(start, cursor - start);
  242. std::transform(tagNode.tag.begin(), tagNode.tag.end(), tagNode.tag.begin(), tolower);
  243. start = cursor + 1;
  244. state = 1;
  245. }
  246. }
  247. else if (state == 1) {
  248. if (element[cursor] == ' ') {
  249. start = cursor + 1;
  250. }
  251. else if (element[cursor] == '=') {
  252. propertyKey = element.substr(start, cursor - start);
  253. state = 2;
  254. }
  255. }
  256. else if (state == 2) {
  257. if (element[cursor] == '"') {
  258. start = cursor + 1;
  259. state = 3;
  260. }
  261. else if (element[cursor] == '\'') {
  262. start = cursor + 1;
  263. state = 4;
  264. }
  265. }
  266. else if (state == 3) {
  267. if (element[cursor] == '"') {
  268. // was suggested to use tagNode.properties[propertyKey] = element.substr(start, cursor - start);
  269. // for better readabiilty
  270. tagNode.properties.insert(std::pair<std::string, std::string>(propertyKey, element.substr(start, cursor - start)));
  271. start = cursor + 1;
  272. state = 1;
  273. }
  274. }
  275. else if (state == 4) {
  276. if (element[cursor] == '\'') {
  277. tagNode.properties.insert(std::pair<std::string, std::string>(propertyKey, element.substr(start, cursor - start)));
  278. start = cursor + 1;
  279. state = 1;
  280. }
  281. }
  282. }
  283. }