├── CMakeLists.txt ├── test ├── makefile ├── README.txt └── test.cpp ├── Example.cpp ├── README.md ├── License.txt ├── LUrlParser.h └── LUrlParser.cpp /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project("LUrlParser") 4 | 5 | add_executable(Example Example.cpp LUrlParser.cpp LUrlParser.h) 6 | 7 | set_property(TARGET Example PROPERTY CXX_STANDARD 11) 8 | set_property(TARGET Example PROPERTY CXX_STANDARD_REQUIRED ON) 9 | -------------------------------------------------------------------------------- /test/makefile: -------------------------------------------------------------------------------- 1 | all: tests 2 | 3 | # If you have googletest in non-standard paths, add them here 4 | #GTESTPATH= 5 | CXXFLAGS=-I${GTESTPATH}/include 6 | LDFLAGS=-L${GTESTPATH}/lib 7 | 8 | tests: ../*.cpp ../*.h test.cpp 9 | gcc -o tests test.cpp ../LUrlParser.cpp $(CXXFLAGS) -std=c++11 -lstdc++ $(LDFLAGS) -lgtest 10 | 11 | clean: 12 | rm -f ./tests 13 | -------------------------------------------------------------------------------- /Example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "LUrlParser.h" 5 | 6 | using namespace std; 7 | 8 | int main() 9 | { 10 | const auto URL = LUrlParser::ParseURL::parseURL("https://John:Dow@github.com:80/corporateshark/LUrlParser/?&query=ssl#q=frag"); 11 | 12 | if (URL.isValid()) 13 | { 14 | cout << "Scheme : " << URL.scheme_ << endl; 15 | cout << "Host : " << URL.host_ << endl; 16 | cout << "Port : " << URL.port_ << endl; 17 | cout << "Path : " << URL.path_ << endl; 18 | cout << "Query : " << URL.query_ << endl; 19 | cout << "Fragment : " << URL.fragment_ << endl; 20 | cout << "User name : " << URL.userName_ << endl; 21 | cout << "Password : " << URL.password_ << endl; 22 | } 23 | else 24 | { 25 | cout << "Parsing error: " << URL.errorCode_ << endl; 26 | } 27 | 28 | return 
0; 29 | } 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Lightweight URL & URI parser (RFC 1738, RFC 3986)** 2 | 3 | (C) Sergey Kosarevsky, 2015-2020 4 | 5 | @corporateshark sk@linderdaum.com 6 | 7 | http://www.linderdaum.com 8 | 9 | http://blog.linderdaum.com 10 | 11 | ============================= 12 | 13 | A tiny and lightweight URL & URI parser (RFC 1738, RFC 3986) written in C++. 14 | 15 | ============================= 16 | 17 | Usage example: 18 | 19 | const auto URL = LUrlParser::ParseURL::parseURL( "https://John:Dow@github.com:80/corporateshark/LUrlParser" ); 20 | 21 | if ( URL.isValid() ) 22 | { 23 | cout << "Scheme : " << URL.scheme_ << endl; 24 | cout << "Host : " << URL.host_ << endl; 25 | cout << "Port : " << URL.port_ << endl; 26 | cout << "Path : " << URL.path_ << endl; 27 | cout << "Query : " << URL.query_ << endl; 28 | cout << "Fragment : " << URL.fragment_ << endl; 29 | cout << "User name : " << URL.userName_ << endl; 30 | cout << "Password : " << URL.password_ << endl; 31 | } 32 | 33 | ============================= 34 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | https://github.com/corporateshark/LUrlParser 2 | 3 | The MIT License (MIT) 4 | 5 | Copyright (C) 2015 Sergey Kosarevsky (sk@linderdaum.com) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above 
copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /test/README.txt: -------------------------------------------------------------------------------- 1 | Unit tests based on GoogleTest 2 | 3 | Build and Run 4 | ------------- 5 | Run 'make' and './tests' 6 | 7 | If libgtest.so or libgtest.a is installed under a non-standard prefix, set GTESTPATH to that prefix on the make command line; the makefile appends /include and /lib to it. 8 | 9 | GTESTPATH=/usr/local/special make 10 | 11 | Output 12 | -------- 13 | 14 | $ ./tests 15 | [==========] Running 8 tests from 1 test case. 16 | [----------] Global test environment set-up. 17 | [----------] 8 tests from UrlTest 18 | [ RUN ] UrlTest.example 19 | [ OK ] UrlTest.example (0 ms) 20 | [ RUN ] UrlTest.cases 21 | [ OK ] UrlTest.cases (0 ms) 22 | [ RUN ] UrlTest.ipv6 23 | [ OK ] UrlTest.ipv6 (0 ms) 24 | [ RUN ] UrlTest.ipv4 25 | [ OK ] UrlTest.ipv4 (0 ms) 26 | [ RUN ] UrlTest.parens 27 | [ OK ] UrlTest.parens (0 ms) 28 | [ RUN ] UrlTest.trailingdot 29 | [ OK ] UrlTest.trailingdot (0 ms) 30 | [ RUN ] UrlTest.specialchars 31 | [ OK ] UrlTest.specialchars (0 ms) 32 | [ RUN ] UrlTest.escapechars 33 | [ OK ] UrlTest.escapechars (0 ms) 34 | [----------] 8 tests from UrlTest (0 ms total) 35 | 36 | [----------] Global test environment tear-down 37 | [==========] 8 tests from 1 test case ran. (0 ms total) 38 | [ PASSED ] 8 tests. 
39 | -------------------------------------------------------------------------------- /LUrlParser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Lightweight URL & URI parser (RFC 1738, RFC 3986) 3 | * https://github.com/corporateshark/LUrlParser 4 | * 5 | * The MIT License (MIT) 6 | * 7 | * Copyright (C) 2015-2020 Sergey Kosarevsky (sk@linderdaum.com) 8 | * 9 | * Permission is hereby granted, free of charge, to any person obtaining a copy 10 | * of this software and associated documentation files (the "Software"), to deal 11 | * in the Software without restriction, including without limitation the rights 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | * copies of the Software, and to permit persons to whom the Software is 14 | * furnished to do so, subject to the following conditions: 15 | * 16 | * The above copyright notice and this permission notice shall be included in all 17 | * copies or substantial portions of the Software. 18 | * 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | * SOFTWARE. 
26 | */ 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | namespace LUrlParser 33 | { 34 | enum LUrlParserError 35 | { 36 | LUrlParserError_Ok = 0, 37 | LUrlParserError_Uninitialized = 1, 38 | LUrlParserError_NoUrlCharacter = 2, 39 | LUrlParserError_InvalidSchemeName = 3, 40 | LUrlParserError_NoDoubleSlash = 4, 41 | LUrlParserError_NoAtSign = 5, 42 | LUrlParserError_UnexpectedEndOfLine = 6, 43 | LUrlParserError_NoSlash = 7, 44 | }; 45 | 46 | class ParseURL 47 | { 48 | public: 49 | LUrlParserError errorCode_ = LUrlParserError_Uninitialized; 50 | std::string scheme_; 51 | std::string host_; 52 | std::string port_; 53 | std::string path_; 54 | std::string query_; 55 | std::string fragment_; 56 | std::string userName_; 57 | std::string password_; 58 | 59 | /// return 'true' if the parsing was successful 60 | bool isValid() const { return errorCode_ == LUrlParserError_Ok; } 61 | 62 | /// helper to convert the port number to int, return 'true' if the port is valid (within the 0..65535 range) 63 | bool getPort(int* outPort) const; 64 | 65 | /// parse the URL 66 | static ParseURL parseURL(const std::string& url); 67 | 68 | private: 69 | ParseURL() = default; 70 | explicit ParseURL(LUrlParserError errorCode) 71 | : errorCode_(errorCode) 72 | {} 73 | }; 74 | } // namespace LUrlParser 75 | -------------------------------------------------------------------------------- /test/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../LUrlParser.h" 3 | 4 | using LUrlParser::clParseURL; 5 | 6 | 7 | class UrlTest : public ::testing::Test { 8 | protected: 9 | virtual void SetUp() { 10 | 11 | } 12 | }; 13 | 14 | TEST_F(UrlTest, example) { 15 | 16 | clParseURL URL = clParseURL::ParseURL( "https://John:Dow@github.com:80/corporateshark/LUrlParser" ); 17 | ASSERT_TRUE(URL.IsValid()); 18 | ASSERT_EQ("https", URL.m_Scheme); 19 | ASSERT_EQ("github.com", URL.m_Host); 20 | ASSERT_EQ("80", URL.m_Port); 21 | 
ASSERT_EQ("corporateshark/LUrlParser", URL.m_Path); 22 | ASSERT_EQ("", URL.m_Query); 23 | } 24 | 25 | TEST_F(UrlTest, cases) { 26 | 27 | clParseURL URL = clParseURL::ParseURL( "HTTPS://ESPN.com/BOLD/PATH?a=A&B=b" ); 28 | ASSERT_TRUE(URL.IsValid()); 29 | 30 | ASSERT_EQ("https", URL.m_Scheme); // notice it's now lowercase 31 | ASSERT_EQ("ESPN.com", URL.m_Host); // case unchanged 32 | ASSERT_EQ("BOLD/PATH", URL.m_Path); 33 | ASSERT_EQ("a=A&B=b", URL.m_Query); 34 | } 35 | 36 | TEST_F(UrlTest, ipv6) { 37 | 38 | clParseURL URL = clParseURL::ParseURL( "https://[fe80::9a01:a7ff:feb1:7dc9]:80/corporateshark/LUrlParser" ); 39 | ASSERT_TRUE(URL.IsValid()); 40 | ASSERT_EQ("[fe80::9a01:a7ff:feb1:7dc9]", URL.m_Host); 41 | } 42 | 43 | TEST_F(UrlTest, ipv4) { 44 | 45 | clParseURL URL = clParseURL::ParseURL( "https://10.0.3.243/corporateshark/LUrlParser" ); 46 | ASSERT_TRUE(URL.IsValid()); 47 | ASSERT_EQ("10.0.3.243", URL.m_Host); 48 | } 49 | 50 | TEST_F(UrlTest, parens) { 51 | 52 | clParseURL URL = clParseURL::ParseURL( "https://en.wikipedia.org/wiki/Joe_Malone_(ice_hockey)" ); 53 | ASSERT_TRUE(URL.IsValid()); 54 | ASSERT_EQ("wiki/Joe_Malone_(ice_hockey)", URL.m_Path); 55 | } 56 | 57 | TEST_F(UrlTest, trailingdot) { 58 | 59 | clParseURL URL = clParseURL::ParseURL( "http://foo.com/blah_blah." ); 60 | ASSERT_TRUE(URL.IsValid()); 61 | ASSERT_EQ("blah_blah.", URL.m_Path); 62 | 63 | URL = clParseURL::ParseURL( "http://foo.com/blah_blah/." 
); 64 | ASSERT_TRUE(URL.IsValid()); 65 | ASSERT_EQ("blah_blah/.", URL.m_Path); 66 | } 67 | 68 | TEST_F(UrlTest, specialchars) { 69 | 70 | clParseURL URL = clParseURL::ParseURL( "https://duckduckgo.com/?q=mark+twain&atb=v23_c&ia=web" ); 71 | ASSERT_TRUE(URL.IsValid()); 72 | ASSERT_EQ("q=mark+twain&atb=v23_c&ia=web", URL.m_Query); 73 | } 74 | 75 | TEST_F(UrlTest, escapechars) { 76 | 77 | clParseURL URL = clParseURL::ParseURL( "https://duckduckgo.com/?q=mark%20twain" ); 78 | ASSERT_TRUE(URL.IsValid()); 79 | ASSERT_EQ("q=mark%20twain", URL.m_Query); 80 | } 81 | 82 | /* // these currently still return valid 83 | 84 | TEST_F(UrlTest, controlchars) { 85 | 86 | clParseURL URL; 87 | URL = clParseURL::ParseURL( "https://en.wikipedia.org/bell\007/" ); 88 | ASSERT_FALSE(URL.IsValid()); 89 | 90 | URL = clParseURL::ParseURL( "https://en.wikipedia.org/line\njere/" ); 91 | ASSERT_FALSE(URL.IsValid()); 92 | } 93 | */ 94 | 95 | int main(int argc, char **argv) { 96 | ::testing::InitGoogleTest(&argc, argv); 97 | int status= RUN_ALL_TESTS(); 98 | return status; 99 | } 100 | -------------------------------------------------------------------------------- /LUrlParser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Lightweight URL & URI parser (RFC 1738, RFC 3986) 3 | * https://github.com/corporateshark/LUrlParser 4 | * 5 | * The MIT License (MIT) 6 | * 7 | * Copyright (C) 2015-2020 Sergey Kosarevsky (sk@linderdaum.com) 8 | * 9 | * Permission is hereby granted, free of charge, to any person obtaining a copy 10 | * of this software and associated documentation files (the "Software"), to deal 11 | * in the Software without restriction, including without limitation the rights 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | * copies of the Software, and to permit persons to whom the Software is 14 | * furnished to do so, subject to the following conditions: 15 | * 16 | * The above copyright notice and 
this permission notice shall be included in all 17 | * copies or substantial portions of the Software. 18 | * 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | * SOFTWARE. 26 | */ 27 | 28 | #include "LUrlParser.h" 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | namespace 35 | { 36 | // check if the scheme name is valid 37 | bool isSchemeValid(const std::string& schemeName) 38 | { 39 | for (auto c : schemeName) 40 | { 41 | if (!isalpha(c) && c != '+' && c != '-' && c != '.') return false; 42 | } 43 | 44 | return true; 45 | } 46 | } 47 | 48 | bool LUrlParser::ParseURL::getPort(int* outPort) const 49 | { 50 | if (!isValid()) { return false; } 51 | 52 | const int port = atoi(port_.c_str()); 53 | 54 | if (port <= 0 || port > 65535) { return false; } 55 | 56 | if (outPort) { *outPort = port; } 57 | 58 | return true; 59 | } 60 | 61 | // based on RFC 1738 and RFC 3986 62 | LUrlParser::ParseURL LUrlParser::ParseURL::parseURL(const std::string& URL) 63 | { 64 | LUrlParser::ParseURL result; 65 | 66 | const char* currentString = URL.c_str(); 67 | 68 | /* 69 | * : 70 | * := [a-z\+\-\.]+ 71 | * For resiliency, programs interpreting URLs should treat upper case letters as equivalent to lower case in scheme names 72 | */ 73 | 74 | // try to read scheme 75 | { 76 | const char* localString = strchr(currentString, ':'); 77 | 78 | if (!localString) 79 | { 80 | return ParseURL(LUrlParserError_NoUrlCharacter); 81 | } 82 | 83 | // save the scheme name 84 | result.scheme_ = std::string(currentString, localString - currentString); 85 | 86 | 
if (!isSchemeValid(result.scheme_)) 87 | { 88 | return ParseURL(LUrlParserError_InvalidSchemeName); 89 | } 90 | 91 | // scheme should be lowercase 92 | std::transform(result.scheme_.begin(), result.scheme_.end(), result.scheme_.begin(), ::tolower); 93 | 94 | // skip ':' 95 | currentString = localString + 1; 96 | } 97 | 98 | /* 99 | * //:@:/ 100 | * any ":", "@" and "/" must be normalized 101 | */ 102 | 103 | // skip "//" 104 | if (*currentString++ != '/') return ParseURL(LUrlParserError_NoDoubleSlash); 105 | if (*currentString++ != '/') return ParseURL(LUrlParserError_NoDoubleSlash); 106 | 107 | // check if the user name and password are specified 108 | bool bHasUserName = false; 109 | 110 | const char* localString = currentString; 111 | 112 | while (*localString) 113 | { 114 | if (*localString == '@') 115 | { 116 | // user name and password are specified 117 | bHasUserName = true; 118 | break; 119 | } 120 | else if (*localString == '/') 121 | { 122 | // end of : specification 123 | bHasUserName = false; 124 | break; 125 | } 126 | 127 | localString++; 128 | } 129 | 130 | // user name and password 131 | localString = currentString; 132 | 133 | if (bHasUserName) 134 | { 135 | // read user name 136 | while (*localString && *localString != ':' && *localString != '@') localString++; 137 | 138 | result.userName_ = std::string(currentString, localString - currentString); 139 | 140 | // proceed with the current pointer 141 | currentString = localString; 142 | 143 | if (*currentString == ':') 144 | { 145 | // skip ':' 146 | currentString++; 147 | 148 | // read password 149 | localString = currentString; 150 | 151 | while (*localString && *localString != '@') localString++; 152 | 153 | result.password_ = std::string(currentString, localString - currentString); 154 | 155 | currentString = localString; 156 | } 157 | 158 | // skip '@' 159 | if (*currentString != '@') 160 | { 161 | return ParseURL(LUrlParserError_NoAtSign); 162 | } 163 | 164 | currentString++; 165 | } 166 | 167 
| const bool bHasBracket = (*currentString == '['); 168 | 169 | // go ahead, read the host name 170 | localString = currentString; 171 | 172 | while (*localString) 173 | { 174 | if (bHasBracket && *localString == ']') 175 | { 176 | // end of IPv6 address 177 | localString++; 178 | break; 179 | } 180 | else if (!bHasBracket && (*localString == ':' || *localString == '/')) 181 | { 182 | // port number is specified 183 | break; 184 | } 185 | 186 | localString++; 187 | } 188 | 189 | result.host_ = std::string(currentString, localString - currentString); 190 | 191 | currentString = localString; 192 | 193 | // is port number specified? 194 | if (*currentString == ':') 195 | { 196 | currentString++; 197 | 198 | // read port number 199 | localString = currentString; 200 | 201 | while (*localString && *localString != '/') localString++; 202 | 203 | result.port_ = std::string(currentString, localString - currentString); 204 | 205 | currentString = localString; 206 | } 207 | 208 | // end of string 209 | if (!*currentString) 210 | { 211 | result.errorCode_ = LUrlParserError_Ok; 212 | 213 | return result; 214 | } 215 | 216 | // skip '/' 217 | if (*currentString != '/') 218 | { 219 | return ParseURL(LUrlParserError_NoSlash); 220 | } 221 | 222 | currentString++; 223 | 224 | // parse the path 225 | localString = currentString; 226 | 227 | while (*localString && *localString != '#' && *localString != '?') localString++; 228 | 229 | result.path_ = std::string(currentString, localString - currentString); 230 | 231 | currentString = localString; 232 | 233 | // check for query 234 | if (*currentString == '?') 235 | { 236 | // skip '?' 
237 | currentString++; 238 | 239 | // read query 240 | localString = currentString; 241 | 242 | while (*localString&&* localString != '#') localString++; 243 | 244 | result.query_ = std::string(currentString, localString - currentString); 245 | 246 | currentString = localString; 247 | } 248 | 249 | // check for fragment 250 | if (*currentString == '#') 251 | { 252 | // skip '#' 253 | currentString++; 254 | 255 | // read fragment 256 | localString = currentString; 257 | 258 | while (*localString) localString++; 259 | 260 | result.fragment_ = std::string(currentString, localString - currentString); 261 | 262 | currentString = localString; 263 | } 264 | 265 | result.errorCode_ = LUrlParserError_Ok; 266 | 267 | return result; 268 | } 269 | --------------------------------------------------------------------------------