You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

267 lines
8.1 KiB

#include "URL.h"
#include "StringUtils.h"
/*
* From the following RFC: https://tools.ietf.org/html/rfc3986
*
* pct-encoded = "%" HEXDIG HEXDIG
* reserved = gen-delims / sub-delims
* gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
* sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
* / "*" / "+" / "," / ";" / "="
*
*/
enum uri_parse_state {
SCHEME,
FIRST_SLASH,
SECOND_SLASH_OR_ELSE,
AUTHORITY,
AUTHORITY_USERINFO, /* The part before '@' */
AUTHORITY_PASSWORD, /* RFC states that we should either reject or ignore it (We ignore it) */
AUTHORITY_HOST,
AUTHORITY_PORT,
PATH,
QUERY,
FRAGMENT,
};
std::unique_ptr<URL> parseUri(std::string raw) {
std::unique_ptr<URL> uri = std::make_unique<URL>();
uri->path = "/";
unsigned int cursor = 0;
unsigned int last = 0;
unsigned int last_semicolon = 0;
enum uri_parse_state state = SCHEME;
// First character of scheme MUST be alphabetic
if (!isalpha(raw[cursor])) {
std::cout << "parse scheme error" << std::endl;
return NULL;
}
for (cursor = 1; cursor < raw.length(); cursor++) {
/* TODO
* Allow scheme-less uri (and fallback to https/http) */
if (state == SCHEME) {
if (raw[cursor] == ':') {
uri->scheme = toLowercase(raw.substr(0, cursor));
/* TODO
* Put default port now (Should use a table for that but
* I don't know C++ enough) */
if (uri->scheme == "http") {
uri->port = 80;
}
state = FIRST_SLASH;
} else if (!isalpha(raw[cursor]) && !isdigit(raw[cursor]) && raw[cursor] != '+' &&
raw[cursor] != '-' && raw[cursor] != '.') {
std::cout << "parse scheme error" << std::endl;
return NULL;
}
} else if (state == FIRST_SLASH) {
if (raw[cursor] == '/') {
state = SECOND_SLASH_OR_ELSE;
} else {
std::cout << "parse error" << std::endl;
}
} else if (state == SECOND_SLASH_OR_ELSE) {
if (raw[cursor] == '/') {
last = cursor + 1;
state = AUTHORITY;
} else {
// TODO Handle this, URI may have only one slash
}
} else if (state == AUTHORITY) {
/* At this point, this could either be the semi colon for
* the password or for the port*/
if (raw[cursor] == ':') {
last_semicolon = cursor;
} else if (raw[cursor] == '@') {
uri->userinfo = raw.substr(last, cursor - last);
last = cursor + 1;
state = AUTHORITY_HOST;
// Authority is finished, everything should be considered as the host[port].
// TODO terminated by the next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end of the URI.
// What to do when ? and # ?
} else if (raw[cursor] == '/') {
if (last_semicolon > 0) {
// TODO Validate port
if (cursor - last_semicolon - 1 > 0) {
uri->port = std::stoi(raw.substr(last_semicolon+1, cursor - last_semicolon+1));
}
uri->host = raw.substr(last, last_semicolon - last);
} else {
uri->host = raw.substr(last, cursor - last);
}
last = cursor;
cursor--;
state = PATH;
} else if (raw[cursor] == '?' || raw[cursor] == '#') {
uri->host = raw.substr(last, cursor - last);
last = cursor;
if (raw[cursor] == '?') {
state = QUERY;
} else {
state = FRAGMENT;
}
} else if (cursor + 1 == raw.length()) {
uri->host = raw.substr(last, last_semicolon - last);
uri->path = "/";
break;
}
} else if (state == AUTHORITY_HOST) {
if (raw[cursor] == ':') {
uri->host = raw.substr(last, cursor - last);
last = cursor+1;
state = AUTHORITY_PORT;
} else if (raw[cursor] == '/') {
uri->host = raw.substr(last, cursor - last);
last = cursor;
cursor--;
state = PATH;
}
} else if (state == AUTHORITY_PORT) {
if (raw[cursor] == '/') {
if (cursor - last > 0) {
uri->port = std::stoi(raw.substr(last, cursor - last));
}
last = cursor;
cursor--;
state = PATH;
} else if (!isdigit(raw[cursor])) {
}
} else if (state == PATH) {
if (raw[cursor] == '?' || raw[cursor] == '#') {
uri->path = raw.substr(last, cursor - last);
last = cursor;
if (raw[cursor] == '?') {
state = QUERY;
} else {
state = FRAGMENT;
}
} else if (cursor + 1 == raw.length()) {
uri->path = raw.substr(last, cursor + 1 - last);
break;
}
} else if (state == QUERY) {
if (raw[cursor] == '#') {
uri->query = raw.substr(last + 1, cursor - last - 1);
last = cursor;
state = FRAGMENT;
} else if (cursor + 1 == raw.length()) {
uri->query = raw.substr(last + 1, cursor + 1 - last);
break;
}
} else if (state == FRAGMENT) {
if (cursor + 1 == raw.length()) {
uri->fragment = raw.substr(last + 1, cursor + 1 - last);
break;
}
}
}
return uri;
}
URL::URL() {
scheme = "";
userinfo = "";
host = "";
port = 0;
path = "";
query = "";
fragment = "";
}
URL::URL(std::string const& url) {
construct(url);
}
std::string URL::toString() const {
if (isRelative()) {
return path;
}
return scheme + "://" + host + path;
}
bool URL::isRelative() const {
return scheme.size() == 0;
}
URL URL::merge(URL const& url) const {
if (!url.isRelative()) {
return url.copy();
}
URL returnURL = copy();
if (url.path[0] == '/' && url.path[1] == '/') {
auto slashPos = url.path.find('/', 2);
returnURL.host = url.path.substr(2, slashPos - 2);
if (slashPos == std::string::npos) {
returnURL.path = "/";
} else {
returnURL.path = url.path.substr(slashPos);
}
} else if (url.path[0] == '/') {
returnURL.path = url.path;
} else {
if (returnURL.path.back() != '/') {
auto finalSlashPos = returnURL.path.find_last_of('/');
returnURL.path.erase(finalSlashPos + 1);
}
returnURL.path += url.path;
}
/*
auto hashPos = returnURL.document.find("#");
if (hashPos != std::string::npos) {
returnURL.document = returnURL.document.substr(0, hashPos);
}
*/
return returnURL;
}
void URL::construct(std::string const& url) {
std::unique_ptr<URL> uri=parseUri(url);
scheme = uri->scheme;
userinfo = uri->userinfo;
host = uri->host;
port = uri->port;
path = uri->path;
query = uri->query;
fragment = uri->fragment;
/*
scheme = getSchemeFromURL(url);
if (scheme.size() != 0) {
host = getHostFromURL(url);
path = getDocumentFromURL(url);
} else {
host = "";
path = url;
}
if (path.size() == 0) {
path = "/";
}
*/
}
URL URL::copy() const {
URL url;
url.scheme = scheme;
url.userinfo = userinfo;
url.host = host;
url.port = port;
url.path = path;
url.query = query;
url.fragment = fragment;
return url;
}
std::ostream& operator<<(std::ostream& out, URL const& url) {
out << url.toString();
return out;
}