Browse Source

unification of tomleb's RFC compliant URI struct and nubben's URL struct

pull/2/head
Odilitime 5 years ago
parent
commit
8813aa4eb2
  1. 233
      src/URL.cpp
  2. 22
      src/URL.h

233
src/URL.cpp

@ -2,10 +2,175 @@ @@ -2,10 +2,175 @@
#include "StringUtils.h"
/*
* From the following RFC: https://tools.ietf.org/html/rfc3986
*
* pct-encoded = "%" HEXDIG HEXDIG
* reserved = gen-delims / sub-delims
* gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
* sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
* / "*" / "+" / "," / ";" / "="
*
*/
/*
 * Scanner states used while walking a URI string left to right.
 * The enumerators are listed in the order the components appear in a URI:
 *   scheme ":" "//" [userinfo "@"] host [":" port] path ["?" query] ["#" fragment]
 */
enum uri_parse_state {
    SCHEME = 0,               /* Reading the scheme, up to the ':' */
    FIRST_SLASH = 1,          /* Expecting the first '/' after "scheme:" */
    SECOND_SLASH_OR_ELSE = 2, /* Expecting the second '/' that introduces an authority */
    AUTHORITY = 3,            /* Inside the authority; userinfo vs. host not yet decided */
    AUTHORITY_USERINFO = 4,   /* The part before '@' */
    AUTHORITY_PASSWORD = 5,   /* RFC states that we should either reject or ignore it (We ignore it) */
    AUTHORITY_HOST = 6,       /* Host part that follows "userinfo@" */
    AUTHORITY_PORT = 7,       /* Port part that follows "host:" */
    PATH = 8,                 /* Path component, starts at a '/' */
    QUERY = 9,                /* Query component, follows '?' */
    FRAGMENT = 10             /* Fragment component, follows '#' */
};
std::unique_ptr<URL> parseUri(std::string raw) {
std::unique_ptr<URL> uri = std::make_unique<URL>();
uri->path = "/";
unsigned int cursor = 0;
unsigned int last = 0;
unsigned int last_semicolon = 0;
enum uri_parse_state state = SCHEME;
// First character of scheme MUST be alphabetic
if (!isalpha(raw[cursor])) {
std::cout << "parse scheme error" << std::endl;
return NULL;
}
for (cursor = 1; cursor < raw.length(); cursor++) {
/* TODO
* Allow scheme-less uri (and fallback to https/http) */
if (state == SCHEME) {
if (raw[cursor] == ':') {
uri->scheme = toLowercase(raw.substr(0, cursor));
/* TODO
* Put default port now (Should use a table for that but
* I don't know C++ enough) */
if (uri->scheme == "http") {
uri->port = 80;
}
state = FIRST_SLASH;
} else if (!isalpha(raw[cursor]) && !isdigit(raw[cursor]) && raw[cursor] != '+' &&
raw[cursor] != '-' && raw[cursor] != '.') {
std::cout << "parse scheme error" << std::endl;
return NULL;
}
} else if (state == FIRST_SLASH) {
if (raw[cursor] == '/') {
state = SECOND_SLASH_OR_ELSE;
} else {
std::cout << "parse error" << std::endl;
}
} else if (state == SECOND_SLASH_OR_ELSE) {
if (raw[cursor] == '/') {
last = cursor + 1;
state = AUTHORITY;
} else {
// TODO Handle this, URI may have only one slash
}
} else if (state == AUTHORITY) {
/* At this point, this could either be the semi colon for
* the password or for the port*/
if (raw[cursor] == ':') {
last_semicolon = cursor;
} else if (raw[cursor] == '@') {
uri->userinfo = raw.substr(last, cursor - last);
last = cursor + 1;
state = AUTHORITY_HOST;
// Authority is finished, everything should be considered as the host[port].
// TODO terminated by the next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end of the URI.
// What to do when ? and # ?
} else if (raw[cursor] == '/') {
if (last_semicolon > 0) {
// TODO Validate port
if (cursor - last_semicolon - 1 > 0) {
uri->port = std::stoi(raw.substr(last_semicolon+1, cursor - last_semicolon+1));
}
uri->host = raw.substr(last, last_semicolon - last);
} else {
uri->host = raw.substr(last, cursor - last);
}
last = cursor;
cursor--;
state = PATH;
} else if (raw[cursor] == '?' || raw[cursor] == '#') {
uri->host = raw.substr(last, cursor - last);
last = cursor;
if (raw[cursor] == '?') {
state = QUERY;
} else {
state = FRAGMENT;
}
} else if (cursor + 1 == raw.length()) {
uri->host = raw.substr(last, last_semicolon - last);
uri->path = "/";
break;
}
} else if (state == AUTHORITY_HOST) {
if (raw[cursor] == ':') {
uri->host = raw.substr(last, cursor - last);
last = cursor+1;
state = AUTHORITY_PORT;
} else if (raw[cursor] == '/') {
uri->host = raw.substr(last, cursor - last);
last = cursor;
cursor--;
state = PATH;
}
} else if (state == AUTHORITY_PORT) {
if (raw[cursor] == '/') {
if (cursor - last > 0) {
uri->port = std::stoi(raw.substr(last, cursor - last));
}
last = cursor;
cursor--;
state = PATH;
} else if (!isdigit(raw[cursor])) {
}
} else if (state == PATH) {
if (raw[cursor] == '?' || raw[cursor] == '#') {
uri->path = raw.substr(last, cursor - last);
last = cursor;
if (raw[cursor] == '?') {
state = QUERY;
} else {
state = FRAGMENT;
}
} else if (cursor + 1 == raw.length()) {
uri->path = raw.substr(last, cursor + 1 - last);
break;
}
} else if (state == QUERY) {
if (raw[cursor] == '#') {
uri->query = raw.substr(last + 1, cursor - last - 1);
last = cursor;
state = FRAGMENT;
} else if (cursor + 1 == raw.length()) {
uri->query = raw.substr(last + 1, cursor + 1 - last);
break;
}
} else if (state == FRAGMENT) {
if (cursor + 1 == raw.length()) {
uri->fragment = raw.substr(last + 1, cursor + 1 - last);
break;
}
}
}
return uri;
}
// Default constructor: assigns the empty string to every string member and 0
// to the port.
// NOTE(review): both the older names (protocol, document) and the RFC names
// (scheme, path) are assigned here — presumably a rename shown with its old
// and new lines together; confirm which members still exist in URL.h.
URL::URL() {
protocol = "";
scheme = "";
userinfo = "";
host = "";
document = "";
port = 0;
path = "";
query = "";
fragment = "";
}
URL::URL(std::string const& url) {
@ -14,13 +179,13 @@ URL::URL(std::string const& url) { @@ -14,13 +179,13 @@ URL::URL(std::string const& url) {
// Builds the textual form of this URL: the bare document/path when the URL is
// relative, otherwise "<scheme>://<host><path>". userinfo, port, query and
// fragment are not included in the output.
// NOTE(review): each return appears twice (document/protocol form followed by
// the path/scheme form); as written only the first of each pair can execute.
// This looks like a rename shown with old and new lines together — confirm.
std::string URL::toString() const {
if (isRelative()) {
return document;
return path;
}
return protocol + "://" + host + document;
return scheme + "://" + host + path;
}
// A URL is considered relative when it has no scheme (protocol) string.
// NOTE(review): two consecutive returns; as written only the first one
// (protocol) executes. Presumably the old and new line of a rename — confirm.
bool URL::isRelative() const {
return protocol.size() == 0;
return scheme.size() == 0;
}
URL URL::merge(URL const& url) const {
@ -30,53 +195,69 @@ URL URL::merge(URL const& url) const { @@ -30,53 +195,69 @@ URL URL::merge(URL const& url) const {
URL returnURL = copy();
if (url.document[0] == '/' && url.document[1] == '/') {
auto slashPos = url.document.find('/', 2);
returnURL.host = url.document.substr(2, slashPos - 2);
if (url.path[0] == '/' && url.path[1] == '/') {
auto slashPos = url.path.find('/', 2);
returnURL.host = url.path.substr(2, slashPos - 2);
if (slashPos == std::string::npos) {
returnURL.document = "/";
returnURL.path = "/";
} else {
returnURL.document = url.document.substr(slashPos);
returnURL.path = url.path.substr(slashPos);
}
} else if (url.document[0] == '/') {
returnURL.document = url.document;
} else if (url.path[0] == '/') {
returnURL.path = url.path;
} else {
if (returnURL.document.back() != '/') {
auto finalSlashPos = returnURL.document.find_last_of('/');
returnURL.document.erase(finalSlashPos + 1);
if (returnURL.path.back() != '/') {
auto finalSlashPos = returnURL.path.find_last_of('/');
returnURL.path.erase(finalSlashPos + 1);
}
returnURL.document += url.document;
returnURL.path += url.path;
}
/*
auto hashPos = returnURL.document.find("#");
if (hashPos != std::string::npos) {
returnURL.document = returnURL.document.substr(0, hashPos);
}
*/
return returnURL;
}
// Fills this URL's members from the string `url` by delegating to parseUri()
// and copying scheme, userinfo, host, port, path, query and fragment out of
// the result. The previous getSchemeFromURL/getHostFromURL/getDocumentFromURL
// based logic is kept below inside a block comment.
void URL::construct(std::string const& url) {
protocol = getProtocolFromURL(url);
if (protocol.size() != 0) {
// NOTE(review): parseUri() returns NULL on a scheme parse error (for example
// when `url` does not start with a letter, as with a relative "/path"); the
// result is dereferenced below without a null check — needs a guard.
std::unique_ptr<URL> uri=parseUri(url);
scheme = uri->scheme;
userinfo = uri->userinfo;
host = uri->host;
port = uri->port;
path = uri->path;
query = uri->query;
fragment = uri->fragment;
/*
scheme = getSchemeFromURL(url);
if (scheme.size() != 0) {
host = getHostFromURL(url);
document = getDocumentFromURL(url);
path = getDocumentFromURL(url);
} else {
host = "";
document = url;
path = url;
}
if (document.size() == 0) {
document = "/";
if (path.size() == 0) {
path = "/";
}
*/
}
// Returns a new URL whose members are assigned one by one from this object.
// NOTE(review): both the older names (protocol, document) and the RFC names
// (scheme, path) are copied — presumably a rename shown with its old and new
// lines together; confirm against the member list in URL.h.
URL URL::copy() const {
URL url;
url.protocol = protocol;
url.scheme = scheme;
url.userinfo = userinfo;
url.host = host;
url.document = document;
url.port = port;
url.path = path;
url.query = query;
url.fragment = fragment;
return url;
}

22
src/URL.h

@ -2,10 +2,15 @@ @@ -2,10 +2,15 @@
#define URL_H
#include <string>
#include <memory>
#include <iostream>
// I'm really mixed about this being a struct like this
// probably fine for now
// Authority component of a URI: [userinfo "@"] host [":" port].
struct Authority {
    std::string userinfo; // text before '@'; empty when absent
    std::string host;
    int port = 0;         // 0 means "no port given"; initialized so a default-constructed Authority is never indeterminate
};
struct URL {
URL();
URL(std::string const& url);
@ -14,15 +19,22 @@ struct URL { @@ -14,15 +19,22 @@ struct URL {
bool isRelative() const;
URL merge(URL const& url) const;
std::string protocol;
// switching to the official RFC names
std::string scheme; // was protocol
//struct Authority authority; /* Can be empty */
std::string userinfo;
std::string host;
std::string document;
int port;
std::string path; // was document
std::string query; // ?blablabla=asdfasf....
std::string fragment; // #asfawefm
private:
void construct(std::string const& url);
URL copy() const;
};
// Parses the absolute URI string `raw` into a newly allocated URL.
// Returns a null pointer when the scheme is malformed (e.g. `raw` does not
// start with a letter), so callers must check the result before dereferencing.
std::unique_ptr<URL> parseUri(std::string raw);
std::ostream& operator<<(std::ostream& out, URL const& url);
#endif

Loading…
Cancel
Save