├── systemd ├── tlgs.conf ├── tlgs_crawler.timer ├── tlgs_crawler.service └── tlgs_server.service ├── tlgsutils ├── tests │ ├── pch.hpp │ ├── main.cpp │ ├── url_blacklist_test.cpp │ ├── url_parser_test.cpp │ ├── utils_test.cpp │ ├── robots_txt_parser_test.cpp │ └── gemini_parser_test.cpp ├── CMakeLists.txt ├── robots_txt_parser.hpp ├── counter.hpp ├── url_blacklist.hpp ├── gemini_parser.hpp ├── utils.hpp ├── url_parser.hpp ├── robots_txt_parser.cpp ├── utils.cpp ├── gemini_parser.cpp ├── url_parser.cpp └── trie.hpp ├── tlgs ├── crawler │ ├── blacklist.hpp │ ├── pch.h │ ├── CMakeLists.txt │ ├── main.cpp │ ├── iconv.hpp │ ├── crawler.hpp │ ├── blacklist.cpp │ └── crawler.cpp ├── CMakeLists.txt ├── server │ ├── pch.h │ ├── contents │ │ ├── .well-known │ │ │ └── security.txt │ │ └── robots.txt │ ├── templates │ │ ├── header.csp │ │ ├── backlinks.csp │ │ ├── homepage.csp │ │ ├── known_hosts.csp │ │ ├── doc_backlinks.csp │ │ ├── statistics.csp │ │ ├── doc_api.csp │ │ ├── about.csp │ │ ├── doc_search.csp │ │ ├── search_result.csp │ │ ├── doc_indexing.csp │ │ └── news.csp │ ├── search_result.hpp │ ├── CMakeLists.txt │ ├── controllers │ │ ├── api.cpp │ │ └── tools.cpp │ └── main.cpp ├── tlgs_ctl │ ├── CMakeLists.txt │ └── main.cpp ├── config.json └── server_config.json ├── .gitmodules ├── .gitignore ├── LICENSE ├── Package.cmake ├── diesgn_docs └── misc.md ├── CMakeLists.txt └── README.md /systemd/tlgs.conf: -------------------------------------------------------------------------------- 1 | u tlgs - "tlgs" /usr/sbin/nologin -------------------------------------------------------------------------------- /tlgsutils/tests/pch.hpp: -------------------------------------------------------------------------------- 1 | #include -------------------------------------------------------------------------------- /tlgs/crawler/blacklist.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | bool inBlacklist(const 
std::string& url_str); -------------------------------------------------------------------------------- /systemd/tlgs_crawler.timer: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Run TLGS Craler 3 | 4 | [Timer] 5 | OnCalendar=Wed,Sun 00:00:00 6 | 7 | [Install] 8 | WantedBy=timers.target -------------------------------------------------------------------------------- /tlgs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(crawler) 2 | add_subdirectory(server) 3 | add_subdirectory(tlgs_ctl) 4 | 5 | install(FILES config.json server_config.json DESTINATION /etc/tlgs/) -------------------------------------------------------------------------------- /tlgs/server/pch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | #include 5 | #include 6 | #include 7 | #endif -------------------------------------------------------------------------------- /tlgsutils/tests/main.cpp: -------------------------------------------------------------------------------- 1 | #define DROGON_TEST_MAIN 2 | #include 3 | using namespace drogon; 4 | 5 | int main(int argc, char** argv) 6 | { 7 | test::run(argc, argv); 8 | } 9 | -------------------------------------------------------------------------------- /tlgs/server/contents/.well-known/security.txt: -------------------------------------------------------------------------------- 1 | Contact: marty1885@protonmail.com 2 | Expires: Tue, 5 Mar 2024 00:00 +0800 3 | Encryption: https://clehaxze.tw/misc/gpg_public.txt 4 | Preferred-Languages: en, zh-TW, eo 5 | -------------------------------------------------------------------------------- /tlgs/server/contents/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /search 3 | Disallow: /v/search 4 | Disallow: /search_jump 5 | 
Disallow: /v/search_jump 6 | Disallow: /add_seed 7 | Disallow: /backlinks 8 | Disallow: /api 9 | -------------------------------------------------------------------------------- /tlgs/tlgs_ctl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(tlgs_ctl main.cpp) 2 | target_link_libraries(tlgs_ctl PRIVATE Drogon::Drogon) 3 | install(TARGETS tlgs_ctl RUNTIME DESTINATION bin) 4 | target_compile_features(tlgs_ctl PRIVATE cxx_std_20) -------------------------------------------------------------------------------- /tlgs/crawler/pch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #endif -------------------------------------------------------------------------------- /tlgs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app":{ 3 | "handle_sig_term": false 4 | }, 5 | "db_clients": [ 6 | { 7 | "rdbms": "postgresql", 8 | "host": "127.0.0.1", 9 | "port": 5432, 10 | "dbname": "tlgs", 11 | "number_of_connections": 4, 12 | "timeout": 10 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /systemd/tlgs_crawler.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=TLGS Crawler 3 | After=network.target 4 | 5 | [Service] 6 | Type=simple 7 | Restart=always 8 | RestartSec=1 9 | StartLimitBurst=5 10 | StartLimitIntervalSec=10 11 | WorkingDirectory=/tmp/ 12 | User=tlgs 13 | ExecStart=/usr/bin/tlgs_crawler /etc/tlgs/config.json -c 4 14 | 15 | [Install] 16 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /systemd/tlgs_server.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | 
Description=TLGS Server 3 | After=network.target 4 | 5 | [Service] 6 | Type=simple 7 | Restart=always 8 | RestartSec=1 9 | StartLimitBurst=5 10 | StartLimitIntervalSec=10 11 | User=tlgs 12 | WorkingDirectory=/etc/tlgs 13 | ExecStart=/usr/bin/tlgs_server /etc/tlgs/server_config.json 14 | 15 | [Install] 16 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thrid_party/dremini"] 2 | path = thrid_party/dremini 3 | url = https://github.com/marty1885/dremini 4 | [submodule "thrid_party/landlock-unveil"] 5 | path = thrid_party/landlock-unveil 6 | url = https://github.com/marty1885/landlock-unveil 7 | [submodule "thrid_party/spartoi"] 8 | path = thrid_party/spartoi 9 | url = https://github.com/marty1885/spartoi 10 | -------------------------------------------------------------------------------- /tlgs/crawler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(tlgs_crawler main.cpp blacklist.cpp crawler.cpp) 2 | target_compile_features(tlgs_crawler PRIVATE cxx_std_20) 3 | find_package(Iconv REQUIRED) 4 | find_package(fmt REQUIRED) 5 | target_link_libraries(tlgs_crawler PRIVATE Drogon::Drogon Iconv::Iconv dremini tlgsutils tbb fmt::fmt) 6 | target_precompile_headers(tlgs_crawler PRIVATE pch.h) 7 | install(TARGETS tlgs_crawler RUNTIME DESTINATION bin) -------------------------------------------------------------------------------- /tlgs/server/templates/header.csp: -------------------------------------------------------------------------------- 1 | <%c++ 2 | std::string title = @@.get("title"); 3 | if(!title.empty()) { 4 | $$ << "# " << title << "\n"; 5 | } 6 | else { 7 | $$ << "# TLGS - \"Totally Legit\" Gemini Search"; 8 | } 9 | 10 | std::string search_url; 11 | if(@@.get("verbose")) 12 | search_url = "/v/search"; 13 | else 14 | search_url = 
"/search"; 15 | %> 16 | 17 | => / 🏠 Home 18 | => {%search_url%} 🔍 Search 19 | => /backlinks? 🔙 Query backlinks 20 | -------------------------------------------------------------------------------- /tlgs/server/search_result.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | struct SearchResult 5 | { 6 | std::string url; 7 | std::string title; 8 | std::string content_type; 9 | std::string preview; 10 | std::string last_crawled_at; 11 | size_t size; 12 | float score; 13 | }; 14 | 15 | struct ServerStatistics 16 | { 17 | std::string update_time; 18 | int64_t page_count; 19 | int64_t domain_count; 20 | std::vector> content_type_count; 21 | }; -------------------------------------------------------------------------------- /tlgs/server/templates/backlinks.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | <%c++ 4 | const auto& internal_backlinks = @@.get>("internal_backlinks"); 5 | const auto& external_backlinks = @@.get>("external_backlinks"); 6 | 7 | $$ << external_backlinks.size() << " cross-capsule backlinks\n"; 8 | for(const auto& url : external_backlinks) 9 | $$ << "=>" << url << "\n"; 10 | 11 | $$ << "\n"; 12 | $$ << internal_backlinks.size() << " internal backlinks\n"; 13 | for(const auto& url : internal_backlinks) 14 | $$ << "=>" << url << "\n"; 15 | %> -------------------------------------------------------------------------------- /tlgs/server/templates/homepage.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | ## Tools and information 4 | => /statistics 📈 TLGS Statistics 5 | => /known-hosts ☑️ Hosts known to TLGS 6 | => /news 📰 TLGS News 7 | => /add_seed 🌱 Missing your capsule? 
Add it to TLGS 8 | 9 | ## About TLGS 10 | 11 | => /about 📖 About TLGS 12 | => /doc/index 📖 Documentation: Indexing 13 | => /doc/search 📖 Documentation: Searching 14 | => /doc/backlinks 📖 Documentation: Backlinks 15 | => /doc/api 📖 Documentation: API 16 | => gemini://gemini.clehaxze.tw The author's capsule 17 | => https://github.com/marty1885/tlgs TLGS Source Code -------------------------------------------------------------------------------- /tlgs/server/templates/known_hosts.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | This is a list of hosts known to TLGS. It is generated from the index TLGS uses for search. Thus, your capsule won't show up in search results if it does not also show up here. If your capsule is missing, you can use the following link to add it to our index. 4 | 5 | => /add_seed Missing your capsule? Add it to TLGS 6 | 7 | Here's all the capsules known by TLGS 8 | 9 | <%c++ 10 | auto hosts = @@.get>>("hosts"); 11 | for(const auto& host : *hosts) { 12 | $$ << "=> " << host << "\n"; 13 | } 14 | %> -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | 31 | build/ 32 | cmake-build-debug/ 33 | .idea/ 34 | html/ 35 | latex/ 36 | .vscode 37 | *.kdev4 38 | .cproject 39 | .project 40 | .settings/ 41 | .vs/ 42 | CMakeSettings.json 43 | install 44 | trace.json 45 | .cache/ 46 | compile_commands.json 47 | 48 | -------------------------------------------------------------------------------- 
/tlgsutils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(tlgsutils gemini_parser.cpp robots_txt_parser.cpp url_parser.cpp utils.cpp) 2 | target_link_libraries(tlgsutils PUBLIC Drogon::Drogon dremini xxhash) 3 | target_compile_features(tlgsutils PRIVATE cxx_std_20) 4 | 5 | if(TLGS_BUILD_TESTS) 6 | add_executable(tlgsutils_test tests/main.cpp 7 | tests/gemini_parser_test.cpp 8 | tests/robots_txt_parser_test.cpp 9 | tests/url_parser_test.cpp 10 | tests/utils_test.cpp 11 | tests/url_blacklist_test.cpp) 12 | target_link_libraries(tlgsutils_test Drogon::Drogon tlgsutils) 13 | target_include_directories(tlgsutils_test PRIVATE .) 14 | target_precompile_headers(tlgsutils_test PRIVATE tests/pch.hpp) 15 | ParseAndAddDrogonTests(tlgsutils_test) 16 | endif() 17 | -------------------------------------------------------------------------------- /tlgs/server/templates/doc_backlinks.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | ### Backlinks 4 | Given a page, backlinks are all the pages that link to said page. TLGS's "Query backlinks" feature allows users to retrieve all the backlinks of a given page. This is useful for discovering new, related capsules and gauging how popular a page is from a different angle. For example: 5 | 6 | => /backlinks?gemini.circumlunar.space 7 | 8 | Note the distinction of internal and cross-capsule backlinks. Internal backlinks are backlinks originating from the same capsule as the target page. And cross-capsule backlinks are those originating from outside the same capsule. For now, determining if a backlink is internal or not is done purely by checking if the host portion of the URL is the same or not. A more in-depth method shall be implemented in the future. 
-------------------------------------------------------------------------------- /tlgsutils/robots_txt_parser.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace tlgs 8 | { 9 | /** 10 | * @brief Parse robots.txt and return a set of disallowed paths. 11 | * 12 | * @param str robots.txt content 13 | * @param agents the user agents we care about. '*' must be explicitly specified otherwise it is ignored 14 | */ 15 | std::vector parseRobotsTxt(const std::string& str, const std::set& agents); 16 | /** 17 | * @brief Check if the path is disallowed by a set of robots.txt rules. 18 | * 19 | * @param str the path to check 20 | * @param disallowed set of robots.txt rules 21 | * @note As of now, this function only supports the * wildcard. ?, [], etc... is undefined behavior. 22 | */ 23 | bool isPathBlocked(const std::string& str, const std::vector& disallowed); 24 | bool isPathBlocked(const std::string& str, const std::string& disallowed_path); 25 | } 26 | -------------------------------------------------------------------------------- /tlgs/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(tlgs_server 2 | main.cpp 3 | controllers/search.cpp 4 | controllers/tools.cpp 5 | controllers/api.cpp) 6 | target_compile_features(tlgs_server PRIVATE cxx_std_20) 7 | find_package(fmt REQUIRED) 8 | target_link_libraries(tlgs_server PRIVATE Drogon::Drogon dremini tlgsutils fmt::fmt spartoi) 9 | if(CMAKE_SYSTEM_NAME STREQUAL "Linux") 10 | target_link_libraries(tlgs_server PRIVATE llunveil) 11 | endif() 12 | target_include_directories(tlgs_server PRIVATE .) 
13 | drogon_create_views(tlgs_server ${CMAKE_CURRENT_SOURCE_DIR}/templates 14 | ${CMAKE_CURRENT_BINARY_DIR}) 15 | target_precompile_headers(tlgs_server PRIVATE pch.h) 16 | add_custom_command( 17 | TARGET tlgs_server POST_BUILD 18 | COMMAND ${CMAKE_COMMAND} 19 | -E 20 | copy_directory 21 | ${CMAKE_CURRENT_SOURCE_DIR}/contents 22 | $/contents) 23 | 24 | install(TARGETS tlgs_server RUNTIME DESTINATION bin) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2021 Martin Chang 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /tlgs/server/templates/statistics.csp: -------------------------------------------------------------------------------- 1 | <%inc 2 | #include "search_result.hpp" 3 | #include 4 | #include 5 | %> 6 | <%c++ 7 | 8 | auto leftpad = [](auto x, size_t num) { 9 | std::stringstream ss; 10 | ss << std::setw(num) << x; 11 | return ss.str(); 12 | }; 13 | 14 | auto server_stat = @@.get>("server_stat"); 15 | %> 16 | 17 | <%view header %> 18 | 19 | ### Pages and domains 20 | 21 | The number of pages and domains known to TLGS at {%server_stat->update_time%}. These figures are updated every 6Hrs or so. 22 | 23 | ``` 24 | {%leftpad(server_stat->page_count, 8)%} : pages 25 | {%leftpad(server_stat->domain_count, 8)%} : domains 26 | ``` 27 | 28 | ### By content type 29 | 30 | The distribution of file types within TLGS's index at {%server_stat->update_time%}. These figures are updated every 6Hrs or so. 31 | 32 | ``` 33 | <%c++ 34 | for(const auto& [content_type, count] : server_stat->content_type_count) 35 | $$ << leftpad(count, 8) << " : " << content_type << "\n"; 36 | %> 37 | ``` 38 | 39 | -------------------------------------------------------------------------------- /tlgsutils/counter.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace tlgs 7 | { 8 | 9 | struct Counter 10 | { 11 | Counter(std::atomic& counter) 12 | : counter_(&counter) 13 | { 14 | count_ = counter_->fetch_add(1, std::memory_order_acq_rel); 15 | } 16 | 17 | Counter(Counter&& other) 18 | { 19 | counter_ = other.counter_; 20 | count_ = other.count_; 21 | other.counter_ = nullptr; 22 | } 23 | 24 | size_t count() const 25 | { 26 | return count_; 27 | } 28 | 29 | ~Counter() 30 | { 31 | if(counter_ != nullptr) 32 | counter_->fetch_sub(1, std::memory_order_acq_rel); 33 | } 34 | 35 | size_t release() 36 | { 37 | if(!counter_) 38 | return -1; 39 | 
size_t n = counter_->fetch_sub(1, std::memory_order_acq_rel)-1; 40 | counter_ = nullptr; 41 | count_ = -1; 42 | return n; 43 | } 44 | 45 | size_t count_ = -1; 46 | std::atomic* counter_; 47 | }; 48 | 49 | } 50 | -------------------------------------------------------------------------------- /Package.cmake: -------------------------------------------------------------------------------- 1 | include(GNUInstallDirs) 2 | 3 | set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE") 4 | set(CPACK_PACKAGE_CONTACT "https://github.com/marty1885/tlgs") 5 | 6 | set(CPACK_PACKAGE_NAME "tlgs") 7 | set(CPACK_PACKAGE_VERSION "0.0.0") 8 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Search engine for the Gemini Protocol") 9 | 10 | # DEB 11 | # Figure out dependencies automatically. 12 | set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) 13 | 14 | # Should be set automatically, but it is not. 15 | execute_process(COMMAND dpkg --print-architecture 16 | OUTPUT_VARIABLE CPACK_DEBIAN_PACKAGE_ARCHITECTURE 17 | OUTPUT_STRIP_TRAILING_WHITESPACE) 18 | 19 | # The default does not produce valid Debian package names. 20 | set(CPACK_DEBIAN_FILE_NAME 21 | "${CPACK_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION}-0_${CPACK_DEBIAN_PACKAGE_ARCHITECTURE}.deb") 22 | 23 | # RPM 24 | set(CPACK_RPM_PACKAGE_LICENSE "MIT") 25 | 26 | # Figure out dependencies automatically. 27 | set(CPACK_RPM_PACKAGE_AUTOREQ ON) 28 | 29 | # Should be set automatically, but it is not. 
30 | execute_process(COMMAND uname -m 31 | OUTPUT_VARIABLE CPACK_RPM_PACKAGE_ARCHITECTURE 32 | OUTPUT_STRIP_TRAILING_WHITESPACE) 33 | 34 | set(CPACK_PACKAGE_FILE_NAME 35 | "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-0.${CPACK_RPM_PACKAGE_ARCHITECTURE}") 36 | 37 | include(CPack) -------------------------------------------------------------------------------- /tlgsutils/url_blacklist.hpp: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | 7 | #include "url_parser.hpp" 8 | #include "robots_txt_parser.hpp" 9 | 10 | namespace tlgs 11 | { 12 | 13 | struct UrlBlacklist 14 | { 15 | 16 | void add(const std::string& url_str) 17 | { 18 | tlgs::Url url(url_str); 19 | if(url.good() == false) 20 | throw std::runtime_error("Invalid URL: " + url_str); 21 | std::string key = tlgs::Url(url).withFragment("").withPath("").withParam("").str(); 22 | blacklisted_.insert({key, url.path()}); 23 | } 24 | 25 | bool isBlocked(const tlgs::Url& url) const 26 | { 27 | std::string key = tlgs::Url(url).withFragment("").withPath("").withParam("").str(); 28 | auto [begin, end] = blacklisted_.equal_range(key); 29 | if(begin == end) 30 | return false; 31 | 32 | for(auto it = begin; it != end; it++) { 33 | const auto& path = it->second; 34 | if(tlgs::isPathBlocked(url.path(), path)) 35 | return true; 36 | } 37 | return false; 38 | } 39 | 40 | bool isBlocked(const std::string& str) const 41 | { 42 | return isBlocked(tlgs::Url(str)); 43 | } 44 | 45 | std::unordered_multimap blacklisted_; 46 | }; 47 | 48 | } 49 | -------------------------------------------------------------------------------- /diesgn_docs/misc.md: -------------------------------------------------------------------------------- 1 | # TLGS design doc 2 | 3 | a.k.a. 
My rambling about why I made something someway 4 | 5 | ## PostgreSQL 6 | 7 | The decision to use PostgreSQL instead of dedicated search engines like ElasticSearch is solely due to the complexity in deployment. Not that I can't deploy complicated applications. But it makes contributing (which hopefully the project will have) hard. Setting up Nextcloud for personal use is already annoying enough. I think people will be deterred from contributing just because of how long it takes to properly set up a development environment. 8 | 9 | ## Why HITS as ranking algorithm 10 | 11 | It is easier in the earlier development process. PageRank runs on the entire network graph instead of a subset. Thus it scales less and requires more initial work to get going. HITS on the other hand runs on a subset of the graph. Making it easier to debug since I don't need to have an entire database ready (this is from an early stage of development). The original idea was to use SALSA. But SALSA requires the text search algorithm to determine which nodes are the hubs. It turns out to be hard to do. Full text search returns the entire list of possible sites without any room for potential hubs in most cases. Crumbling the algorithm. 12 | 13 | ~~Totally would want to switch to SALSA if I can. To avoid TKC problem.~~ 14 | **Update:** Ditched HITS. Now using SALSA. Turns out the original SALSA paper has an easy way to distinguish hubs and auths. 
15 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15) 2 | project(tlgs) 3 | 4 | option(TLGS_BUILD_TESTS "Build TLGS tests" ON) 5 | 6 | set(CMAKE_CXX_STANDARD 20) 7 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 8 | 9 | # include(CheckCXXCompilerFlag) 10 | # check_cxx_compiler_flag("-fstack-protector-strong" HAVE_FSTACK_PROTECTOR) 11 | # if(HAVE_FSTACK_PROTECTOR) 12 | # message(STATUS "Stack protector enabled") 13 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong") 14 | # endif() 15 | 16 | set(DREMINI_BUILD_EXAMPLES OFF) 17 | set(DREMINI_BUILD_TEST OFF) 18 | set(SPARTOI_BUILD_EXAMPLES OFF) 19 | 20 | # Enable tail call optimization so coroutines doesn't blow up the stack 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -foptimize-sibling-calls") 22 | 23 | if(CMAKE_SYSTEM_NAME STREQUAL "Linux") 24 | add_subdirectory(thrid_party/landlock-unveil) 25 | include_directories(thrid_party/landlock-unveil) 26 | endif() 27 | add_subdirectory(thrid_party/dremini) 28 | include_directories(thrid_party/dremini) 29 | add_subdirectory(thrid_party/spartoi) 30 | include_directories(thrid_party/spartoi) 31 | 32 | find_package(Drogon REQUIRED) 33 | include_directories(.) 
34 | add_subdirectory(tlgsutils) 35 | add_subdirectory(tlgs) 36 | 37 | install(FILES systemd/tlgs_crawler.service systemd/tlgs_crawler.timer systemd/tlgs_server.service DESTINATION /etc/systemd/system/) 38 | install(FILES systemd/tlgs.conf DESTINATION lib/sysusers.d) 39 | 40 | include(Package.cmake) -------------------------------------------------------------------------------- /tlgs/server_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app":{ 3 | "document_root": "./contents", 4 | "file_types": [ 5 | "gif", 6 | "png", 7 | "jpg", 8 | "js", 9 | "css", 10 | "html", 11 | "ico", 12 | "swf", 13 | "xap", 14 | "apk", 15 | "cur", 16 | "xml", 17 | "gmi", 18 | "txt" 19 | ], 20 | "use_implicit_page": true, 21 | "implicit_page": "index.gmi", 22 | "home_page": "index.gmi", 23 | "mime": { 24 | "text/gemini": ["gmi", "gemini"] 25 | }, 26 | "handle_sig_term": false 27 | }, 28 | "plugins": [ 29 | { 30 | "name": "dremini::GeminiServerPlugin", 31 | "config": { 32 | "listeners": [ 33 | { 34 | "key": "../../key.pem", 35 | "cert": "../../cert.pem", 36 | "ip": "0.0.0.0", 37 | "port": 1966 38 | } 39 | ], 40 | "numThread": 3, 41 | "translate_to_html": true 42 | } 43 | }, 44 | { 45 | "name": "spartoi::SpartanServerPlugin", 46 | "config": { 47 | "listeners": [ 48 | { 49 | "ip": "0.0.0.0", 50 | "port": 3000 51 | } 52 | ], 53 | "numThread": 3, 54 | "translate_to_html": true 55 | } 56 | } 57 | ], 58 | "listeners": [ 59 | { 60 | "ip": "0.0.0.0", 61 | "port": 8808 62 | } 63 | ], 64 | "db_clients": [ 65 | { 66 | "rdbms": "postgresql", 67 | "host": "127.0.0.1", 68 | "port": 5432, 69 | "dbname": "tlgs", 70 | "number_of_connections": 4, 71 | "timeout": 10 72 | } 73 | ], 74 | "custom_config": { 75 | "tlgs": { 76 | "ranking_algo": "salsa" 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /tlgsutils/gemini_parser.hpp: --------------------------------------------------------------------------------
1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "url_parser.hpp" 10 | 11 | namespace tlgs 12 | { 13 | 14 | struct GeminiDocument 15 | { 16 | std::string text; 17 | std::string title; 18 | std::vector links; 19 | }; 20 | 21 | /** 22 | * @brief Parse and convert a Gemini document into title 23 | * , plaintext(without styling and heading) and links 24 | * 25 | * @param sv the Gemini document 26 | */ 27 | GeminiDocument extractGemini(const std::string_view sv); 28 | GeminiDocument extractGemini(const std::vector& nodes); 29 | 30 | /** 31 | * @brief Parse and convert a Gemini document into title 32 | * , plaintext(without styling and heading and ASCII art) and links 33 | * 34 | * @param sv the Gemini document 35 | */ 36 | GeminiDocument extractGeminiConcise(const std::string_view sv); 37 | GeminiDocument extractGeminiConcise(const std::vector& nodes); 38 | 39 | /** 40 | * @brief Check if a Gemini page can be interperd as Gemsub using heuristics 41 | * 42 | * @param doc The parsed Gemini document 43 | * @param feed_url the url of the Gemini page. If set, the chceking heuristics only considers feed entries 44 | * that are hosted on the same domain as feed_url 45 | * @param 46 | */ 47 | bool isGemsub(const std::vector& nodes); 48 | bool isGemsub(const std::vector& nodes, const tlgs::Url& feed_url, const std::string_view protocol = ""); 49 | } 50 | -------------------------------------------------------------------------------- /tlgs/server/templates/doc_api.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | To assist with (potential) data sharing between servers. TLGS provides a set of APIs that returns some useful internal information in a format that is easy for computers to understand; mostly JSON. Note that these APIs are intentionally slowed down to prevent abuse. The purpose of the APIs should be self explanatory. 
4 | 5 | The APIs available are geared towards sharing important information from TLGS' index that other servers may need, while crawling the entire geminispace would be too expensive for them. 6 | 7 | ## Known resources 8 | 9 | => /api/v1/known_hosts 10 | 11 | Sends back a list of known hosts. Hosts are distinguished by accessible domain name, the IP address and the port. If the same host is reachable by multiple IP addresses/domain names or via different ports, it will show up multiple times in the list. 12 | 13 | => /api/v1/known_feeds 14 | 15 | Sends back a list of known feeds. By default this API returns all Atom feeds as that's the default format for Gemini. It also supports RSS and twtxt by setting the parameter. ex: `/api/v1/known_feeds?rss` 16 | 17 | Supported formats: atom, rss, twtxt, gemsub 18 | 19 | => /api/v1/known_secutity_txt 20 | 21 | [WIP] Sends back a list of known security.txt (RFC9116) files. These files include contact information for the host in case of a security issue. The list provided are files accessible by the TLGS crawler, i.e. not blocked by robots.txt and not in a capsule that TLGS blacklists. 22 | 23 | => /api/v1/known_perma_redirects 24 | 25 | Sends back a list of known permanent redirects (Gemini status code 31). To potentially allow new archivers and search engines to quickly bootstrap their index instead of wasting time following redirects. 
-------------------------------------------------------------------------------- /tlgsutils/tests/url_blacklist_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | DROGON_TEST(BlacklistTest) 5 | { 6 | tlgs::UrlBlacklist blacklist; 7 | blacklist.add("gemini://example.com/"); 8 | CHECK(blacklist.isBlocked("gemini://") == false); 9 | CHECK(blacklist.isBlocked("gemini://example.com/") == true); 10 | CHECK(blacklist.isBlocked("gemini://example.com/index.gmi") == true); 11 | CHECK(blacklist.isBlocked("gemini://example.com") == true); 12 | CHECK(blacklist.isBlocked("gemini://example.org/") == false); 13 | 14 | blacklist.add("gemini://example.org/"); 15 | CHECK(blacklist.isBlocked("gemini://example.org/") == true); 16 | CHECK(blacklist.isBlocked("gemini://example.org/index.gmi") == true); 17 | 18 | blacklist.add("gemini://example.net/cgi-bin"); 19 | CHECK(blacklist.isBlocked("gemini://example.net/cgi-bin/get-data?123456") == true); 20 | CHECK(blacklist.isBlocked("gemini://example.net/cgi-bin/get-data?123456#123") == true); 21 | CHECK(blacklist.isBlocked("gemini://example.net/cgi-bin") == true); 22 | CHECK(blacklist.isBlocked("gemini://example.net/data/cgi-bin") == false); 23 | 24 | CHECK(blacklist.isBlocked("gemini://example.online/") == false); 25 | CHECK(blacklist.isBlocked("gemini://example") == false); 26 | CHECK(blacklist.isBlocked("http://example.com") == false); 27 | 28 | blacklist.add("gemini://example.gov/data/"); 29 | CHECK(blacklist.isBlocked("gemini://example.gov/data") == false); 30 | 31 | // test normalisation 32 | CHECK(blacklist.isBlocked("gemini://example.gov/test/../data/") == true); 33 | CHECK(blacklist.isBlocked(tlgs::Url("gemini://example.gov/test/../data/")) == true); 34 | // this fails as the path needs to be normalised 35 | // CHECK(blacklist.isBlocked(tlgs::Url("gemini://example.gov/test/../data/", false)) == true); 36 | 37 | // test fragment 38 | 
blacklist.add("gemini://example.gov/data3"); 39 | CHECK(blacklist.isBlocked("gemini://example.gov/data3#test") == true); 40 | } 41 | -------------------------------------------------------------------------------- /tlgs/crawler/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "crawler.hpp" 8 | 9 | #include "CLI/App.hpp" 10 | #include "CLI/Formatter.hpp" 11 | #include "CLI/Config.hpp" 12 | 13 | using namespace drogon; 14 | using namespace trantor; 15 | 16 | int main(int argc, char** argv) 17 | { 18 | trantor::Logger::setLogLevel(trantor::Logger::LogLevel::kInfo); 19 | CLI::App cli{"TLGS crawler"}; 20 | 21 | std::string seed_link_file; 22 | size_t concurrent_connections = 1; 23 | bool force_reindex = false; 24 | std::string config_file = "/etc/tlgs/config.json"; 25 | cli.add_option("-s,--seed", seed_link_file, "Path to seed links for initalizing crawling"); 26 | cli.add_option("-c", concurrent_connections, "Number of concurrent connections"); 27 | cli.add_option("--force-reindex", force_reindex, "Force re-indexing of all links"); 28 | cli.add_option("config_file", config_file, "Path to TLGS config file"); 29 | 30 | CLI11_PARSE(cli, argc, argv); 31 | LOG_INFO << "Loading config from " << config_file; 32 | app().loadConfigFile(config_file); 33 | 34 | app().getLoop()->queueInLoop(async_func([&]() -> Task { 35 | auto crawler = std::make_shared(app().getIOLoop(0)); 36 | crawler->setMaxConcurrentConnections(concurrent_connections); 37 | crawler->enableForceReindex(force_reindex); 38 | if(!seed_link_file.empty()) { 39 | std::ifstream in(seed_link_file); 40 | if(in.is_open() == false) { 41 | LOG_ERROR << "Cannot open " << seed_link_file; 42 | abort(); 43 | } 44 | std::string line; 45 | while (std::getline(in, line)) { 46 | if(line.empty()) 47 | continue; 48 | LOG_INFO << "Seed added: " << line << "\n"; 49 | crawler->addUrl(line); 50 | } 51 | } 
52 | 53 | co_await crawler->crawlAll(); 54 | app().quit(); 55 | })); 56 | 57 | 58 | app().run(); 59 | } 60 | -------------------------------------------------------------------------------- /tlgsutils/utils.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "url_parser.hpp" 7 | #include 8 | 9 | 10 | namespace tlgs 11 | { 12 | /** 13 | * @brief Detects if the string is ASCII art using a simple heuristic 14 | * 15 | * @param str the string to check 16 | */ 17 | bool isAsciiArt(const std::string& str); 18 | 19 | /** 20 | * @brief URL encode 21 | * 22 | * @param str the string to encode 23 | */ 24 | std::string urlEncode(const std::string_view str); 25 | 26 | /** 27 | * @brief Compose a URL and a path 28 | * 29 | * @param url the URL 30 | * @param path the path 31 | * @begincode 32 | * linkCompose(url("https://example.com"), "/path/to/file.txt") == "https://example.com/path/to/file.txt"; 33 | * @endcode 34 | */ 35 | tlgs::Url linkCompose(const tlgs::Url& url, const std::string& path); 36 | 37 | /** 38 | * @brief Detect URIs like mailto:xxxx ldap:xxxx (Without ://) 39 | * 40 | * @param sv the uri to check 41 | */ 42 | bool isNonUriAction(const std::string& sv); 43 | 44 | /** 45 | * @brief Computes xxhash of a string 46 | * 47 | * @param str the string 48 | * @return std::string hex encoded hash 49 | */ 50 | std::string xxHash64(const std::string_view str); 51 | 52 | template 53 | requires std::is_invocable_v 54 | auto filter(const T& data, Func&& func) 55 | { 56 | std::vector ret; 57 | std::copy_if(data.begin(), data.end(), std::back_inserter(ret), std::forward(func)); 58 | return ret; 59 | } 60 | 61 | template 62 | requires std::is_invocable_v 63 | auto map(const T& data, Func&& func) 64 | { 65 | using func_ret_type = decltype(func(data.front())); 66 | std::vector ret; 67 | ret.reserve(data.size()); 68 | std::transform(data.begin(), data.end(), 
std::back_inserter(ret), std::forward(func)); 69 | return ret; 70 | } 71 | 72 | std::optional try_strtoull(const std::string& str); 73 | 74 | std::string pgSQLRealEscape(std::string str); 75 | 76 | /** 77 | * @brief Convert URL into index-friendly string 78 | */ 79 | std::string indexFriendly(const tlgs::Url& url); 80 | } // namespace tlgs -------------------------------------------------------------------------------- /tlgs/server/templates/about.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | ## About TLGS 4 | 5 | TLGS, the Totally Legit Gemini Search is an experimental search engine for contents served over the Gemini Protocol. It crawls and indexes textual contents that it encounters in the Geminispace. And provides a interface for people to look for what they need. The interface is heavly inspired by GUS and geminispace.info. 6 | 7 | TLGS is built on a (at the time) state of the art programming language - Ru.... C++20! And it uses Drogon as the base framework for networking, database, etc... 8 | 9 | => https://github.com/drogonframework/drogon Drogon - A fast C++ web application framework 10 | 11 | ## A new search engine? 12 | 13 | TLGS is not based on search engine/provider that preexisted on Gemini. It's a pet project, a new enging designed to index and provide search with as little computation resources as possible. There's no real motivation behind the authors developing it. Hence the "Totally Legit" part, mocking myself. It's just fun and I think Gemini is a cool project. 14 | 15 | The source code of TLGS is available at the following link. And we welcome all contributions: 16 | 17 | => https://github.com/marty1885/tlgs TLGS Source Code 18 | 19 | ## Contacting the Admins 20 | 21 | You may reach the admins of tlgs.one (there might be other instances of TLGS on Geminispace) via email at pilot \at tlgs.one 22 | 23 | Currently it's a one crew team. 
I'll be more than happy to share the load if this ended up important to the Geminispace. Makes sure the project being alive when I'm busy or just not in the mood of doing projects. 24 | 25 | ## The Obligatory Gemini Introduction 26 | 27 | What is and why Gemini? That's a good question. Gemini is an new protocol designed in 2019 that attempts to bring the early internet back to the modern age. Why? The modern web has some serious issues, especially from a technology point of view. For example. HTML is overly difficult to parse and layout, modern web design is too information sparse, tracking through fingirprint and JavaScript. Gemini is an alternative aims to solve them. The protocol is by specification encrypted, the gemini markup language is easy to parse, identity on Gemini is verified through client certificates, etc... 28 | 29 | However, Gemini is not intended to replace the common Web. But to coexist with it. Gemini focueses on textual and information dense pages. Like personal blogs (called Gemlogs) news and discussion threads. There exist proxies from the common Web to Gemini and vise versa. However it's encouraged to use a Gemini browser to browse contents on Geminispace. Also, Gemini does not have a styling system. The presentation of a site is fully determined by the browser. 30 | 31 | Hop over to the Gemini Project homepage to learn more about it! 
32 | 33 | => gemini://gemini.circumlunar.space/ Gemini Project 34 | -------------------------------------------------------------------------------- /tlgsutils/url_parser.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace tlgs 6 | { 7 | 8 | struct Url 9 | { 10 | Url() = default; 11 | Url(const std::string& str, bool normalize_url = true); 12 | Url(const Url&) = default; 13 | Url(Url&&) = default; 14 | Url& operator=(const tlgs::Url&) = default; 15 | inline bool good() const { return good_; } 16 | std::string str() const; 17 | std::string hostWithPort(unsigned short default_port) const; 18 | Url& withHost(const std::string& new_host); 19 | Url& withPath(const std::string& new_path, bool normalize_path = true); 20 | Url& withParam(const std::string& new_param); 21 | Url& withProtocol(const std::string& new_protocol); 22 | Url& withPort(unsigned short new_port); 23 | Url& withPort(); 24 | Url& withDefaultPort(unsigned short n); 25 | Url& normalize(); 26 | Url& withFragment(const std::string& new_fragment); 27 | inline bool validate() 28 | { 29 | good_ = !protocol_.empty() && !host_.empty() && !path_.empty(); 30 | for(auto ch : protocol_) { 31 | if(isalnum(ch) == false) { 32 | good_ = false; 33 | break; 34 | } 35 | } 36 | return good_; 37 | } 38 | 39 | int port(int default_port = 0) const; 40 | const std::string& protocol() const; 41 | const std::string& host() const; 42 | const std::string& path() const; 43 | const std::string& param() const; 44 | const std::string& fragment() const; 45 | 46 | static int protocolDefaultPort(const std::string_view& proto); 47 | 48 | bool operator==(const Url& other) const 49 | { 50 | return protocol_ == other.protocol_ 51 | && host_ == other.host_ 52 | && path_ == other.path_ 53 | && param_ == other.param_ 54 | && fragment_ == other.fragment_ 55 | && port_ == other.port_; 56 | } 57 | 58 | bool operator<(const Url& other) const 59 | { 60 | 
if(protocol_ != other.protocol_) 61 | return protocol_ < other.protocol_; 62 | if(host_ != other.host_) 63 | return host_ < other.host_; 64 | if(path_ != other.path_) 65 | return path_ < other.path_; 66 | if(param_ != other.param_) 67 | return param_ < other.param_; 68 | if(fragment_ != other.fragment_) 69 | return fragment_ < other.fragment_; 70 | if(port_ != other.port_) 71 | return port_ < other.port_; 72 | return false; 73 | } 74 | 75 | protected: 76 | std::string protocol_; 77 | std::string host_; 78 | int port_ = 0; 79 | std::string path_; 80 | std::string param_; 81 | std::string fragment_; 82 | bool good_ = true; 83 | int default_port_; 84 | mutable std::string cache_; 85 | }; 86 | 87 | } -------------------------------------------------------------------------------- /tlgs/server/templates/doc_search.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | 4 | ### The search algorithm 5 | 6 | TLGS uses a combination of content ranking and link analysis to generate the search result. Put it simply, TlGS first finds all pages in it's index that matches the search query. Then it collects the connection from or to the matched pages. Figures out which pages are more popular. Then, the two information are combined to form the final search result. The technique TLGS uses are designed for the early web - which is exactly what Gemini is designed to be like. 7 | 8 | Here's links if you want to learn more: 9 | 10 | => https://en.wikipedia.org/wiki/Full-text_search Wikipedia: Full-text search 11 | => http://www.cs.technion.ac.il/~moran/r/PS/www9.pdf SALSA: The stochastic approach for link-structure analysis (SALSA) and the TKC effect 12 | 13 | ### Filters 14 | 15 | Filters allows you to, well, filter the search result by different criteria. 
The following criteria are available 16 | 17 | * content_type 18 | * domain 19 | * size 20 | * intitle 21 | 22 | To apply the filters, just add the filter followed by a colon then the criteria after your search query. Note that you mush supply a search term besides the filter for the search to work. For example 23 | 24 | => /search?Gemini%20content_type%3Atext%2Fgemini Gemini content_type:text/gemini 25 | => /search?UNIX%20content_type%3Atext%2Fplain UNIX content_type:text/plain 26 | => /search?gemini%20domain%3Agemini.circumlunar.space Gemini domain:gemini.circumlunar.space 27 | 28 | And this is not a valid search query. TLGS will ask you to enter the search query again 29 | => /search?content_type:text/plain content_type:text/plain 30 | 31 | The size filter works slightly differently. Instead of searching for a certain size, it searches for files larger or smaller than a size. By default the unit of size is in bytes. You can also suffix the size with units. Currently K, M, G, Ki, Mi, Gi are supported. Be aware that TLGS does not know about the size of binary files (images, iso files, etc..). Thus these files are excluded when the size filter is applied. 32 | 33 | => /search?Gemini%20size%3A%3C2048 Gemini size:<2048 34 | => /search?Gemini%20size%3A%3E5K Gemini size:>5K 35 | => /search?Gemini%20size%3A%3C5KB Gemini size:<5KB 36 | => /search?Gemini%20size%3A%3C0.005MiB Gemini size:<0.005MiB 37 | 38 | The NOT operator negates the proceeding filter. 39 | 40 | => /search?Gemini%20NOT%20content_type%3Atext%2Fgemini Gemini NOT content_type:text/gemini 41 | 42 | Applying filters of the same type effectively ORs them together. While applying multiple filters of different type ANDs them together. 
43 | 44 | => /search?Gemini%20content_type%3Atext%2Fgemini%20content_type%3Atext%2Fplain Gemini content_type:text/gemini content_type:text/plain 45 | => /search?Gemini%20content_type%3Atext%2Fgemini%20size%3A%3C5K Gemini content_type:text/gemini size:<5K 46 | 47 | ### Verbose mode 48 | 49 | Verbose mode allows one to gain insight on how TLGS ranks contents and when the page is crawled. You can enable verbose mode on any search page by clicking the "Enter Verbose Mode" link. This will send to a similar page but with more information about the search. You can then click "Exit Verbose Mode" to return to the normal search style. 50 | 51 | You can also enter verbose mode manually via the URL. Prefix any search page with a `/v` to enter. For example: 52 | 53 | => /search?Gemini 54 | => /v/search?Gemini -------------------------------------------------------------------------------- /tlgs/server/templates/search_result.csp: -------------------------------------------------------------------------------- 1 | <%inc 2 | #include "search_result.hpp" 3 | #include 4 | #include 5 | #include 6 | %> 7 | <%view header %> 8 | 9 | <%c++ 10 | auto sanitize_previw = [](std::string_view str) -> std::string { 11 | std::string result; 12 | result.reserve(str.size()); 13 | const std::string_view spaces = " \r\n\t"; 14 | bool is_space = true; 15 | for(auto ch : str) { 16 | if(spaces.find(ch) == std::string_view::npos) 17 | result.push_back(ch); 18 | else if(!is_space) 19 | result.push_back(' '); 20 | is_space = (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'); 21 | } 22 | auto idx = result.find_first_not_of("`*=>#"); 23 | if(idx == std::string::npos) 24 | return result; 25 | return result.substr(idx); 26 | }; 27 | auto search_result = @@.get>("search_result"); 28 | bool verbose_mode = @@.get("verbose"); 29 | auto encoded_search_term = @@.get("encoded_search_term"); 30 | size_t current_page = @@.get("current_page_idx")+1; 31 | size_t item_per_page = @@.get("item_per_page"); 32 | size_t 
total_results = @@.get("total_results"); 33 | size_t max_pages = total_results/item_per_page + (total_results%item_per_page ? 1 : 0); 34 | std::string search_query = @@.get("search_query"); 35 | 36 | if(!verbose_mode) { 37 | std::string search_path = " /v/search/"+std::to_string(current_page)+"?"+encoded_search_term; 38 | $$ << "## Search\n" 39 | << fmt::format("=> {} 📚 Enter verbose search\n", search_path); 40 | } 41 | else { 42 | std::string search_path = " /search/"+std::to_string(current_page)+"?"+encoded_search_term; 43 | $$ << "## Search [verbose]\n" 44 | << fmt::format("=> {} 📚 Exit verbose search\n", search_path); 45 | } 46 | 47 | for(const auto& result : search_result) { 48 | size_t size_in_kb = std::max(result.size / 1000, size_t{1}); 49 | std::string content_type = result.content_type; 50 | if(content_type.empty()) 51 | content_type = "unknown/none"; 52 | auto url = tlgs::Url(result.url); 53 | if(size_in_kb != 0) 54 | $$ << fmt::format("=> {} {} ({}, {}KB)\n", url.str(), result.title, content_type, size_in_kb); 55 | else 56 | $$ << fmt::format("=> {} {} ({})\n", url.str(), result.title, content_type); 57 | if(result.title != url.str()) 58 | $$ << "* " << utils::urlDecode(url.str()) << "\n"; 59 | if(verbose_mode) { 60 | $$ << "* Score: " << result.score << "\n"; 61 | $$ << "* Last Crawled: " << result.last_crawled_at << "\n"; 62 | } 63 | $$ << sanitize_previw(result.preview) << "\n"; 64 | $$ << "\n"; 65 | } 66 | 67 | if(search_result.size() == 0) { 68 | $$ << "> WOW, such emptiness... - Totally not Doge\n" 69 | << "0 search result found\n"; 70 | } 71 | else { 72 | $$ << fmt::format("Page {} of {} ({} results).\n", current_page, max_pages, total_results); 73 | std::string search_path = (verbose_mode ? 
"/v/search" : "/search"); 74 | 75 | if(current_page != 1) { 76 | auto page_num = current_page-1; 77 | if(page_num != 1) 78 | $$ << fmt::format("=> {}/{}?{} ⬅️ Previous Page\n", search_path, current_page-1, encoded_search_term); 79 | else 80 | $$ << fmt::format("=> {}?{} ⬅️ Previous Page\n", search_path, encoded_search_term); 81 | } 82 | if(current_page != max_pages) 83 | $$ << fmt::format("=> {}/{}?{} ➡️ Next Page\n", search_path, current_page+1, encoded_search_term); 84 | if(max_pages != 1) 85 | $$ << fmt::format("=> {}_jump/{} ↗️ Go to page\n", search_path, encoded_search_term); 86 | } 87 | %> -------------------------------------------------------------------------------- /tlgs/crawler/iconv.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2011, Yuya Unno. 2021, Martin Chang 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the Yuya Unno nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY 21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | 38 | namespace iconvpp { 39 | 40 | class converter { 41 | public: 42 | converter(const std::string& out_encode, 43 | const std::string& in_encode, 44 | bool ignore_error = false, 45 | size_t buf_size = 1024) 46 | : ignore_error_(ignore_error), 47 | buf_size_(buf_size) { 48 | if (buf_size == 0) { 49 | throw std::runtime_error("buffer size must be greater than zero"); 50 | } 51 | 52 | iconv_t conv = ::iconv_open(out_encode.c_str(), in_encode.c_str()); 53 | if (conv == (iconv_t)-1) { 54 | if (errno == EINVAL) 55 | throw std::runtime_error( 56 | "not supported from " + in_encode + " to " + out_encode); 57 | else 58 | throw std::runtime_error("unknown error"); 59 | } 60 | iconv_ = conv; 61 | } 62 | 63 | ~converter() { 64 | iconv_close(iconv_); 65 | } 66 | 67 | void convert(const std::string_view& input, std::string& output) const { 68 | // copy the string to a buffer as iconv function requires a non-const char 69 | // pointer. 
70 | std::vector in_buf(input.begin(), input.end()); 71 | char* src_ptr = &in_buf[0]; 72 | size_t src_size = input.size(); 73 | 74 | std::vector buf(buf_size_); 75 | std::string dst; 76 | while (0 < src_size) { 77 | char* dst_ptr = &buf[0]; 78 | size_t dst_size = buf.size(); 79 | size_t res = ::iconv(iconv_, &src_ptr, &src_size, &dst_ptr, &dst_size); 80 | if (res == (size_t)-1) { 81 | if (errno == E2BIG) { 82 | // ignore this error 83 | } else if (ignore_error_) { 84 | // skip character 85 | ++src_ptr; 86 | --src_size; 87 | } else { 88 | check_convert_error(); 89 | } 90 | } 91 | dst.append(&buf[0], buf.size() - dst_size); 92 | } 93 | dst.swap(output); 94 | } 95 | 96 | private: 97 | void check_convert_error() const { 98 | switch (errno) { 99 | case EILSEQ: 100 | case EINVAL: 101 | throw std::runtime_error("invalid multibyte chars"); 102 | default: 103 | throw std::runtime_error("unknown error"); 104 | } 105 | } 106 | 107 | iconv_t iconv_; 108 | bool ignore_error_; 109 | const size_t buf_size_; 110 | }; 111 | 112 | } // namespace iconvpp 113 | -------------------------------------------------------------------------------- /tlgs/crawler/crawler.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | class GeminiCrawler : public trantor::NonCopyable 13 | { 14 | public: 15 | using EventLoop = trantor::EventLoop; 16 | template 17 | using Task = drogon::Task; 18 | 19 | GeminiCrawler(EventLoop* loop) : loop_(loop) {} 20 | 21 | /** 22 | * @brief Adds a url to the crawling queue. 23 | * 24 | * @param url the URL to be queued. 25 | */ 26 | 27 | void addUrl(const std::string& url) 28 | { 29 | craw_queue_.push(url); 30 | } 31 | 32 | /** 33 | * @brief Start the crawler. 34 | * @note This function is async and returns immediately. 
35 | * 36 | */ 37 | void start() 38 | { 39 | dispatchCrawl(); 40 | } 41 | 42 | /** 43 | * @brief co_returns when the crawler is done. 44 | */ 45 | Task awaitEnd() 46 | { 47 | int finish_count = 0; 48 | while(true) { 49 | co_await drogon::sleepCoro(loop_, 0.2); 50 | if(ended_) 51 | break; 52 | 53 | // HACK: Sometimes ended_ is not set. Workaround when we see 5 conseqitive 0 crawlings 54 | if(ongoing_crawlings_ == 0) { 55 | finish_count ++; 56 | loop_->runInLoop([this] {dispatchCrawl();}); 57 | } 58 | else 59 | finish_count = 0; 60 | if(finish_count == 5) { 61 | ended_ = true; 62 | break; 63 | } 64 | } 65 | co_return; 66 | } 67 | 68 | /** 69 | * @brief Crawl all the pages 70 | * 71 | * @return Task awaiter for the end of the crawl. 72 | */ 73 | Task crawlAll() 74 | { 75 | dispatchCrawl(); 76 | return awaitEnd(); 77 | } 78 | 79 | void setMaxConcurrentConnections(size_t n) 80 | { 81 | max_concurrent_connections_ = n; 82 | } 83 | 84 | size_t maxConcurrentConnections() const 85 | { 86 | return max_concurrent_connections_; 87 | } 88 | 89 | void enableForceReindex(bool enable=true) 90 | { 91 | force_reindex_ = enable; 92 | } 93 | protected: 94 | /** 95 | * @brief Launches up to max_concurrent_connections_ concurrent crawler tasks (not threads) 96 | * to fetch content from URLs. This function is async and returns immediately. 97 | * 98 | * @note await on `awaiteEnd()` to wait for all the tasks to finish. 99 | * @note This function continues launch tasks until all the URLs are crawled. 100 | */ 101 | void dispatchCrawl(); 102 | /** 103 | * @brief Should the crawler crawl this URL? Checks against robots.txt and an internel 104 | * blacklist. 105 | * @param url The URL to check. 106 | * 107 | * @return true if the crawler should crawl this URL. 108 | */ 109 | Task shouldCrawl(std::string url_str); 110 | /** 111 | * 112 | * @brief Get the next URL that the crawler should crawl. 113 | * @param url The URL to crawl. 114 | * 115 | * @return std::nullopt if no URL is available. 
116 | */ 117 | Task> getNextCrawlPage(); 118 | Task> getNextPotentialCarwlUrl(); 119 | /** 120 | * @brief Crawl the given URL. Then add the content found in that URL to the DB 121 | * 122 | * @param url_str the URL to crawl 123 | */ 124 | Task crawlPage(const std::string& url_str); 125 | 126 | EventLoop* loop_; 127 | tbb::concurrent_unordered_map host_timeout_count_; 128 | tbb::concurrent_queue craw_queue_; 129 | size_t max_concurrent_connections_ = 1; 130 | std::atomic ongoing_crawlings_ = 0; 131 | std::atomic ended_ = false; 132 | bool force_reindex_ = false; 133 | }; 134 | -------------------------------------------------------------------------------- /tlgs/server/templates/doc_indexing.csp: -------------------------------------------------------------------------------- 1 | <%view header %> 2 | 3 | ### What does TLGS index? 4 | 5 | As of now, TLGS only indexes textual content withing Gemini. And it ignores all links to other protocols like HTTP and Gopher. It only parses and follows links withing `text/gemini` pages. Thus links withing other textual pages are ignored. Though contents in `text/plain`, `text/markdown`, `text/x-rst` and `plaintext` are still indexed. 6 | 7 | Textual pages over 2.5MB are not indexed nor links within are followed. And we only fetch headers of binary files to make sure the file exists. Then it's URL is directly added to the index. If you see broken connections requesting binary files. That's likely TLGS checking if the file exists. 8 | 9 | Please understand that TLGS allows manually excluding capsules from it's index. That the maintainer utilizes to prevent TLGS from crawling sites that causes issues with ranking or other functionalities. Please don't take it personally or think it is censorship if your content does not get indexed. TLGS although designed to be scalable and efficient, does run on fairly limited resources. We promise to keep working and make the indexing more efficient and resilient. 
10 | 11 | Currently especially contents of the following types is excluded: 12 | 13 | * Mirrors of large websites from the common Web such as social media, Wikipedia (too big to be added to our index) 14 | * Mirrors of news aggrigrates from the common Web (again, too big to be added) 15 | * Git commit histories, RFC mirrors and other web archives 16 | 17 | With that said. We do index individual news mirror as long as they are not overly large. 18 | 19 | ### Does TLGS index the entirity of my page? 20 | 21 | Yes... but not really 100%. To provide better search result. After retrieving the page. TLGS removes ASCII arts and content separators from the page before indexing. There will be false positives in the detection despite our efforts. Causing some code segments (especially esoteric languages like BrainF**k) be removed from the index. Thus almost everything relevant will be indexed. 22 | 23 | ### robots.txt support 24 | 25 | TLGS supports the simplified robots.txt specification for Gemini. To prevent crawling pages of your site, place a "robots.txt" file in your capsule's root directory. TLGS will fetch and parse the file before crawling your site. The file should have a content type of `text/plain` and is encoded in UTF-8, without BOM. The file is cached by TLGS for 7 days. TLGS will reuse the file within the caching period to reduce bandwidth consumption and speedup crawling. 26 | 27 | => gemini://gemini.circumlunar.space/docs/companion/robots.gmi robots.txt for Gemini 28 | 29 | TLGS follows the following user agents: 30 | * tlgs 31 | * indexer 32 | * * 33 | 34 | Note that TLGS's crawler does not support any of the following extended features common on the Web. And we only support the * and $ special characters. 35 | 36 | * Allow directives 37 | * Crawl-Delay directives 38 | 39 | ### How can I recognize requests from TLGS? 40 | 41 | You can't. 
Gemini does not provide a way for TLGS (or any client in that matter) to notify the server about the implementation and/or intent. 42 | 43 | ### Does TLGS keeps the data forever? 44 | 45 | No. TLGS will automatically remove a page from it's index after 1 month of failing to re-retrieve it (ex: the server was removed from the internet, the pages was deleted, etc..). 46 | 47 | ## Details 48 | 49 | ### Redirection 50 | 51 | TLGS allows up to 5 redirections before giving up. If at any point the redirection is not to a Gemini URL or points to somewhere blocked by robots.txt. TLGS will stop following the redirection chain and consider the page as not found. 52 | 53 | ### Special case: 53 Proxy Request Refused 54 | 55 | TLGS does not care about the returned status code. It only accepts 20 for success and 3X for redirection. All other status means resource not avaliable to TLGS. 53 is the only exception. Martin (the author of TLGS) had a discussion with a Gemini user. Due to some weird quarks (compatablity, etc..) crawlers often ignore certificate errors. Leading to capsules being crawled on the wrong domain. Status 53 is used to tell TLGS that it is requsting from thewrong domain. TLGS will then stop crawling said page within a 21 day period. 
#include <drogon/drogon_test.h>
#include <tlgsutils/url_parser.hpp>

// Unit tests for tlgs::Url: parsing, validation, normalization and
// re-serialization of gemini:// (and other) URLs.
DROGON_TEST(UrlParserTest)
{
    // A well-formed HTTP URL parses into its components.
    auto url = tlgs::Url("http://example.com");
    CHECK(url.good() == true);
    CHECK(url.protocol() == "http");
    CHECK(url.host() == "example.com");
    CHECK(url.port() == 80); // guessed from the protocol string
    CHECK(url.path() == "/");
    CHECK(url.param() == "");

    // Malformed URLs must be rejected.
    url = tlgs::Url("http:/example.com");
    CHECK(url.good() == false);

    url = tlgs::Url("$!@://example.com");
    CHECK(url.good() == false);

    url = tlgs::Url("./example.com");
    CHECK(url.good() == false);

    // Non-numeric and empty port specifications are invalid.
    url = tlgs::Url("http://example.com:ababababa");
    CHECK(url.good() == false);

    url = tlgs::Url("http://example.com:/");
    CHECK(url.good() == false);

    url = tlgs::Url("://example.com");
    CHECK(url.good() == false);

    url = tlgs::Url("");
    CHECK(url.good() == false);

    // Gemini URLs: the default port 1965 is implied and omitted on output.
    url = tlgs::Url("gemini://localhost/index.gmi?15");
    CHECK(url.good() == true);
    CHECK(url.protocol() == "gemini");
    CHECK(url.host() == "localhost");
    CHECK(url.port() == 1965);
    CHECK(url.path() == "/index.gmi");
    CHECK(url.param() == "15");
    CHECK(url.str() == "gemini://localhost/index.gmi?15");

    url = tlgs::Url("gemini://localhost:1965/index.gmi?15");
    CHECK(url.good() == true);
    CHECK(url.str() == "gemini://localhost/index.gmi?15");

    // Non-default ports are preserved in the serialized form.
    url = tlgs::Url("gemini://localhost:1966");
    CHECK(url.good() == true);
    CHECK(url.port() == 1966);
    CHECK(url.path() == "/");
    CHECK(url.str() == "gemini://localhost:1966/");

    // Scheme parsing is case-insensitive.
    url = tlgs::Url("GEMINI://localhost");
    CHECK(url.good() == true);
    CHECK(url.protocol() == "gemini");

    // ".." path segments are resolved during normalization.
    url = tlgs::Url("gemini://localhost/a/../b");
    CHECK(url.good() == true);
    CHECK(url.path() == "/b");

    url = tlgs::Url("gemini://localhost:1965/aaa");
    CHECK(url.good() == true);
    CHECK(url.path() == "/aaa");
    CHECK(url.param() == "");
    CHECK(url.fragment() == "");
    CHECK(url.str() == "gemini://localhost/aaa");

    // Query and fragment handling.
    url = tlgs::Url("gemini://localhost/aaa#123");
    CHECK(url.good() == true);
    CHECK(url.path() == "/aaa");
    CHECK(url.fragment() == "123");
    CHECK(url.str() == "gemini://localhost/aaa#123");

    url = tlgs::Url("gemini://localhost/aaa?456#123");
    CHECK(url.good() == true);
    CHECK(url.path() == "/aaa");
    CHECK(url.param() == "456");
    CHECK(url.fragment() == "123");
    CHECK(url.str() == "gemini://localhost/aaa?456#123");

    // Everything after '#' belongs to the fragment, including a '?'.
    url = tlgs::Url("gemini://localhost/aaa#123?456");
    CHECK(url.good() == true);
    CHECK(url.path() == "/aaa");
    CHECK(url.param() == "");
    CHECK(url.fragment() == "123?456");

    // A hostname must not start with a dot.
    url = tlgs::Url("gemini://.smol.pub/");
    CHECK(url.good() == false);

    // no protocol, i.e. use the same protocol as the current page
    url = tlgs::Url("//smol.pub/");
    CHECK(url.good() == true);
    CHECK(url.protocol() == "");
    CHECK(url.host() == "smol.pub");
    CHECK(url.path() == "/");
    CHECK(url.str() == "//smol.pub/");

    url = tlgs::Url("//smol.pub/123456");
    CHECK(url.good() == true);
    CHECK(url.protocol() == "");
    CHECK(url.host() == "smol.pub");
    CHECK(url.path() == "/123456");
    CHECK(url.str() == "//smol.pub/123456");

    url = tlgs::Url("://");
    CHECK(url.good() == false);

    // Ports must fit in 1..65535.
    url = tlgs::Url("geini://example.com:65536/");
    CHECK(url.good() == false);

    url = tlgs::Url("geini://example.com:-1/");
    CHECK(url.good() == false);

    url = tlgs::Url("geini://example.com:0/");
    CHECK(url.good() == false);

    // Garbage around the scheme separator is rejected.
    url = tlgs::Url("gemini/example://example.com/");
    CHECK(url.good() == false);

    url = tlgs::Url("/example.com//gemini://example.com/");
    CHECK(url.good() == false);

    url = tlgs::Url("example.com//example.com");
    CHECK(url.good() == false);

    url = tlgs::Url("http://gemini://example.com");
    CHECK(url.good() == false);
}
* Only fetch headers for files it can't index to save bandwidth and time
* Handles all kinds of source encoding
* Link analysis using the SALSA algorithm

As of now, indexing of news sites, RFCs, and documentation is mostly disabled. It will likely be enabled once I have the means and resources to scale the setup.

## Using this project

### Requirements

* [drogon](https://github.com/an-tao/drogon)
* [nlohmann-json](https://github.com/nlohmann/json)
* [CLI11](https://github.com/CLIUtils/CLI11)
* [libfmt](https://github.com/fmtlib/fmt)
* [TBB](https://github.com/oneapi-src/oneTBB)
* [xxHash](https://github.com/Cyan4973/xxHash)
* iconv
* PostgreSQL

### Building and running the project

To build the project, you'll need a fully C++20 capable compiler. The following compilers should work as of writing this README:

* GCC >= 11.2
* MSVC >= 16.25

Install all dependencies, then run the commands:

```bash
mkdir build
cd build
cmake ..
make -j
```

### Creating and maintaining the index

To create the initial index:

1. Initialize the database `./tlgs/tlgs_ctl/tlgs_ctl ../tlgs/config.json populate_schema`
2. Place the seed URLs into `seeds.text`
3. In the build folder, run `./tlgs/crawler/tlgs_crawler -s seeds.text -c 4 ../tlgs/config.json`

Now the crawler will start crawling the geminispace while also updating outdated indices (if any). To update an existing index, run:

```bash
./tlgs/crawler/tlgs_crawler -c 2 ../tlgs/config.json
# -c is the maximum concurrent connections the crawler will make
```

**NOTE:** TLGS's crawler is distributable. You can run multiple instances in parallel. But some instances may drop out early towards the end of crawling. Though it does not affect the result of crawling.

### Running the capsule

```bash
openssl req -new -subj "/CN=my.host.name.space" -x509 -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 -days 36500 -nodes -out cert.pem -keyout key.pem
cd tlgs/server
./tlgs_server ../../../tlgs/server_config.json
```

### Via systemd

```bash
sudo systemctl start tlgs_server
sudo systemctl start tlgs_crawler
```

## Server config

The `custom_config.tlgs` section in `server_config.json` (installed at `/etc/tlgs/server_config.json`) contains configurations for the TLGS server. Besides the usual [Drogon config options](https://drogon.docsforge.com/master/configuration-file/), `custom_config` changes the properties of TLGS itself. Currently supported options are:

### ranking_algo
The ranking algorithm TLGS uses to rank pages in search results. The ranking is then combined with the text match score to produce the final search rank. Currently supported values are `hits` and `salsa`, referring to the [HITS][hits] and [SALSA][salsa] ranking algorithms. It defaults to `salsa` if no value is provided.

SALSA runs slightly faster than HITS for large search results. Both [literature][najork2007comparing] and empirical experience suggest SALSA provides better ranking. Thus we switched from HITS to SALSA.

```json
"ranking_algo": "salsa"
```

## TODOs

- [ ] Code cleanup
- [ ] I really need to centralize the crawling logic
- [x] Randomize the order of crawling. Avoid bashing a single capsule
  * Sort of..
namespace tlgs {

/**
 * @brief Parses a robots.txt document and collects the Disallow rules that
 *        apply to any of the given user-agents.
 *
 * Consecutive "User-agent:" lines form a single group; if ANY agent in the
 * group is in `agents`, the rules that follow apply. An empty "Disallow:"
 * value means "allow everything" and clears the rules collected so far.
 *
 * @param str    raw robots.txt content (LF or CRLF line endings)
 * @param agents the user-agent tokens we answer to (e.g. {"*", "tlgs"})
 * @return sorted, de-duplicated list of disallowed path patterns
 */
std::vector<std::string> parseRobotsTxt(const std::string& str, const std::set<std::string>& agents)
{
    // Normalize CRLF to LF so std::getline splits lines cleanly.
    std::string lf_str = str;
    size_t crlf = 0;
    while((crlf = lf_str.find("\r\n", crlf)) != std::string::npos)
        lf_str.erase(crlf, 1); // drop the '\r', keep the '\n'

    std::set<std::string> disallowed_path;
    std::stringstream ss(lf_str);
    // "key: value" with optional surrounding whitespace.
    static const std::regex line_re(R"([ \t]*(.*):[ \t]*(.*))");
    std::smatch match;
    std::string line;
    bool care = true;                  // do the current rules apply to us?
    bool last_line_user_agent = false; // are we inside a User-agent group?
    while(std::getline(ss, line)) {
        if(!std::regex_match(line, match, line_re))
            continue;

        std::string key = match[1];
        // Keys are case-insensitive per the robots.txt convention.
        std::transform(key.begin(), key.end(), key.begin(), ::tolower);
        if(key == "user-agent") {
            std::string agent = match[2];
            // Within a group of consecutive User-agent lines, matching any
            // one of them is enough; a new group resets the decision.
            if(last_line_user_agent) {
                care |= agents.count(agent) > 0;
            }
            else {
                care = agents.count(agent) > 0;
            }
            last_line_user_agent = true;
        }
        else {
            last_line_user_agent = false;
        }

        if(key == "disallow" && care == true) {
            std::string path = match[2];
            // Empty Disallow == allow all.
            if(path.empty())
                disallowed_path.clear();
            else
                disallowed_path.insert(path);
        }
    }
    return std::vector<std::string>(disallowed_path.begin(), disallowed_path.end());
}

} // namespace tlgs
// Portable prefix/suffix helpers (string_view::starts_with/ends_with are
// C++20-only; compare() keeps this TU usable from C++17 as well).
static bool svStartsWith(const std::string_view& s, const std::string_view& prefix)
{
    return s.size() >= prefix.size() && s.compare(0, prefix.size(), prefix) == 0;
}

static bool svEndsWith(const std::string_view& s, const std::string_view& suffix)
{
    return s.size() >= suffix.size()
        && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

/**
 * @brief Fast matching for common cases. Otherwise, use the regex. Fast cases include
 *  1. No wildcard
 *  2. Starts with *
 *  3. Ends with *
 *  4. Starts with * and ends with $
 *  5. Starts with * and ends with *
 *  6. Contains * in the middle
 *
 * @param pattern the wildcard pattern (a robots.txt Disallow value)
 * @param str the URL path to match
 */
static bool wildcardPathMatch(std::string pattern, const std::string_view& str)
{
    if(pattern.empty())
        return false;

    size_t star_count = std::count(pattern.begin(), pattern.end(), '*');
    if(star_count == 0) {
        // A trailing '$' anchors the rule to the end of the path; with no
        // wildcard that reduces to an exact comparison. (BUGFIX: previously
        // the '$' was compared literally, so such rules never matched.)
        if(pattern.back() == '$')
            return str == std::string_view(pattern).substr(0, pattern.size()-1);
        // Plain rules are prefix matches on whole path segments.
        return str == pattern || str == pattern+"/"
            || (str.size() > pattern.size()+1 && svStartsWith(str, pattern) && (str[pattern.size()] == '/' || pattern.back() == '/'));
    }

    if(pattern.back() == '$' && (svStartsWith(pattern, "*") || svStartsWith(pattern, "/*")))
        pattern.pop_back();

    // */foo/bar/*
    if(pattern[0] == '*' && pattern.back() == '*' && star_count == 2)
        return str.find(pattern.substr(1, pattern.size()-2)) != std::string_view::npos;
    // /*/foo/*
    if(svStartsWith(pattern, "/*") && pattern.back() == '*' && star_count == 2)
        return str.find(pattern.substr(2, pattern.size()-3)) != std::string_view::npos;
    // *fooo...
    if(pattern[0] == '*' && star_count == 1)
        return svEndsWith(str, pattern.substr(1));
    // /*foo...
    if(svStartsWith(pattern, "/*") && star_count == 1)
        return svEndsWith(str, pattern.substr(2));
    // foo*
    if(pattern.back() == '*' && star_count == 1)
        return svStartsWith(str, pattern.substr(0, pattern.size() - 1));

    // Now * must be in the middle: "prefix*suffix". The suffix must occur at
    // or after the end of the prefix. BUGFIX: the old `rfind(suffix) > n`
    // check evaluated true when the suffix was ABSENT (npos > n), wrongly
    // matching any path that merely begins with the prefix.
    auto n = pattern.find('*');
    if(n != std::string::npos && star_count == 1)
        return svStartsWith(str, std::string_view(pattern).substr(0, n))
            && str.find(pattern.substr(n+1), n) != std::string_view::npos;

    // Else we convert the pattern to a regex and try to match
    std::string regex_pattern;
    const std::string_view escape_chars = "\\.+()[]{}|";
    for(char ch : pattern) {
        if(ch == '*')
            regex_pattern += ".*";
        else if(escape_chars.find(ch) != std::string_view::npos)
            regex_pattern += "\\" + std::string(1, ch);
        else
            regex_pattern += ch;
    }
    try {
        std::regex re(regex_pattern);
        std::smatch sm;
        std::string s(str);
        return std::regex_match(s, sm, re);
    }
    catch(std::regex_error&) {
        // An unparsable pattern blocks nothing.
        return false;
    }
}

namespace tlgs {

/// Returns true when `path` is blocked by ANY rule in `disallowed_paths`.
bool isPathBlocked(const std::string& path, const std::vector<std::string>& disallowed_paths)
{
    for(const auto& disallowed : disallowed_paths) {
        if(wildcardPathMatch(disallowed, path))
            return true;
    }
    return false;
}

/// Returns true when `path` is blocked by the single rule `disallowed_path`.
bool isPathBlocked(const std::string& path, const std::string& disallowed_path)
{
    return wildcardPathMatch(disallowed_path, path);
}

} // namespace tlgs
'A' - 10 : '0'; 16 | second = c & 0x0F; 17 | second += second > 9 ? 'A' - 10 : '0'; 18 | 19 | result.append(1, first); 20 | result.append(1, second); 21 | 22 | return result; 23 | } 24 | 25 | bool tlgs::isAsciiArt(const std::string& str) 26 | { 27 | // detection algorithm 2.A from https://www.w3.org/WAI/ER/IG/ert/AsciiArt.htm 28 | size_t count = 0; 29 | char last_ch = 0; 30 | for(auto ch : str) { 31 | if(ch == last_ch) 32 | count++; 33 | else { 34 | count = 1; 35 | last_ch = ch; 36 | } 37 | 38 | if(count >= 4 && ch != ' ' && ch != '\t') 39 | return true; 40 | } 41 | 42 | // Characters I'm sure is not used in code 43 | if(str.find("☆") != std::string::npos 44 | || str.find("★") != std::string::npos 45 | || str.find("░") != std::string::npos 46 | || str.find("█") != std::string::npos 47 | || str.find("⣿") != std::string::npos 48 | || str.find("⡇") != std::string::npos 49 | || str.find("⢀") != std::string::npos 50 | || str.find("┼") != std::string::npos 51 | || str.find("╭") != std::string::npos) 52 | return true; 53 | 54 | // patterns that's definatelly not normal text 55 | if(str.find("(_-<") != std::string::npos || 56 | str.find("_ _ _ _") != std::string::npos) 57 | return true; 58 | 59 | return false; 60 | } 61 | 62 | std::string tlgs::urlEncode(const std::string_view src) 63 | { 64 | std::string result; 65 | result.reserve(src.size() + 8); // Some sane amount to reduce allocation 66 | // Unserved symbols. 
See RFC 3986 67 | constexpr std::string_view symbols = "-_.~"; 68 | 69 | for (char ch : src) 70 | { 71 | if (ch == ' ') 72 | result.append(1, '+'); 73 | else if (isalnum(ch) || symbols.find(ch) != std::string_view::npos) 74 | result.append(1, ch); 75 | else 76 | { 77 | result.append(1, '%'); 78 | result.append(charToHex(ch)); 79 | } 80 | } 81 | 82 | return result; 83 | } 84 | 85 | tlgs::Url tlgs::linkCompose(const tlgs::Url& url, const std::string& path) 86 | { 87 | assert(path.size() != 0); 88 | tlgs::Url link_url; 89 | if(path[0] == '/') { 90 | tlgs::Url dummy = tlgs::Url("gemini://localhost"+path); 91 | link_url = tlgs::Url(url).withPath(dummy.path()).withParam(dummy.param()).withFragment(dummy.fragment()); 92 | } 93 | else { 94 | tlgs::Url dummy = tlgs::Url("gemini://localhost/"+path, false); 95 | auto link_path = std::filesystem::path(dummy.path().substr(1)); 96 | auto current_path = std::filesystem::path(url.path()); 97 | 98 | if(url.path().back() == '/') // we are visiting a directory 99 | link_url = tlgs::Url(url).withPath((current_path/link_path).generic_string()); 100 | else 101 | link_url = tlgs::Url(url).withPath((current_path.parent_path()/link_path).generic_string()); 102 | link_url.withParam(dummy.param()).withFragment(dummy.fragment()); 103 | } 104 | return link_url; 105 | } 106 | 107 | bool tlgs::isNonUriAction(const std::string& str) 108 | { 109 | // detects if the string is of the form [scheme]:[other_stuff] 110 | size_t n = str.find("://"); 111 | if(n != std::string::npos) 112 | return false; 113 | n = str.find(":"); 114 | for(size_t i = 0; i < n; i++) { 115 | if(isalpha(str[i]) == false) 116 | return false; 117 | } 118 | return true; 119 | } 120 | 121 | std::string tlgs::xxHash64(const std::string_view str) 122 | { 123 | auto hash = XXH64(str.data(), str.size(), 0); 124 | return drogon::utils::binaryStringToHex((unsigned char*)&hash, sizeof(hash)); 125 | } 126 | 127 | std::optional tlgs::try_strtoull(const std::string& str) 128 | { 129 | char* 
endptr; 130 | unsigned long long result = strtoull(str.c_str(), &endptr, 10); 131 | if(endptr == str.c_str() || *endptr != '\0') 132 | return std::nullopt; 133 | return result; 134 | } 135 | 136 | std::string tlgs::indexFriendly(const tlgs::Url& url) 137 | { 138 | std::string result = url.str(); 139 | drogon::utils::replaceAll(result, "_", " "); 140 | drogon::utils::replaceAll(result, "-", " "); 141 | drogon::utils::replaceAll(result, "~", " "); 142 | 143 | return result; 144 | } 145 | 146 | std::string tlgs::pgSQLRealEscape(std::string str) 147 | { 148 | drogon::utils::replaceAll(str, "\\", "\\\\"); 149 | drogon::utils::replaceAll(str, std::string(1, '\0'), "\\0"); 150 | drogon::utils::replaceAll(str, "\n", "\\n"); 151 | drogon::utils::replaceAll(str, "\r", "\\r"); 152 | drogon::utils::replaceAll(str, "'", "''"); 153 | drogon::utils::replaceAll(str, "\"", "\\\""); 154 | drogon::utils::replaceAll(str, "\x1a", "\\Z"); 155 | return str; 156 | } 157 | -------------------------------------------------------------------------------- /tlgs/tlgs_ctl/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace drogon; 4 | 5 | #include "CLI/App.hpp" 6 | #include "CLI/Formatter.hpp" 7 | #include "CLI/Config.hpp" 8 | 9 | Task<> createDb() 10 | { 11 | auto db = app().getDbClient(); 12 | co_await db->execSqlCoro(R"( 13 | CREATE TABLE IF NOT EXISTS public.pages ( 14 | url text NOT NULL, 15 | domain_name text NOT NULL, 16 | port integer NOT NULL, 17 | content_type text, 18 | charset text, 19 | lang text, 20 | title text, 21 | content_body text, 22 | feed_type TEXT, 23 | size bigint DEFAULT 0 NOT NULL, 24 | last_indexed_at timestamp without time zone, 25 | last_crawled_at timestamp without time zone, 26 | last_crawl_success_at timestamp without time zone, 27 | last_status integer, 28 | last_meta text, 29 | first_seen_at timestamp without time zone NOT NULL, 30 | search_vector tsvector, 31 | 
cross_site_links json, 32 | internal_links json, 33 | title_vector tsvector, 34 | last_queued_at timestamp without time zone, 35 | indexed_content_hash text NOT NULL default '', 36 | raw_content_hash text NOT NULL default '', 37 | PRIMARY KEY (url) 38 | ); 39 | )"); 40 | co_await db->execSqlCoro("CREATE INDEX IF NOT EXISTS last_crawled_index ON public.pages USING btree (last_crawled_at DESC);"); 41 | co_await db->execSqlCoro("CREATE INDEX IF NOT EXISTS search_vector_index ON public.pages USING gin (search_vector);"); 42 | 43 | co_await db->execSqlCoro(R"( 44 | CREATE TABLE IF NOT EXISTS public.links ( 45 | url text NOT NULL, 46 | host text NOT NULL, 47 | port integer NOT NULL, 48 | to_url text NOT NULL, 49 | to_host text NOT NULL, 50 | to_port integer NOT NULL, 51 | is_cross_site boolean NOT NULL 52 | ); 53 | )"); 54 | co_await db->execSqlCoro("CREATE INDEX IF NOT EXISTS is_cross_site_index ON public.links USING btree (is_cross_site);"); 55 | co_await db->execSqlCoro("CREATE INDEX source_url_index ON public.links USING btree (url);"); 56 | co_await db->execSqlCoro("CREATE INDEX to_url_index ON public.links USING btree (to_url);"); 57 | 58 | co_await db->execSqlCoro(R"( 59 | CREATE TABLE IF NOT EXISTS public.robot_policies ( 60 | host text NOT NULL, 61 | port integer NOT NULL, 62 | disallowed text NOT NULL 63 | ); 64 | )"); 65 | co_await db->execSqlCoro("CREATE INDEX IF NOT EXISTS host_port_index ON public.robot_policies USING btree (host, port);"); 66 | 67 | co_await db->execSqlCoro(R"( 68 | CREATE TABLE IF NOT EXISTS public.robot_policies_status ( 69 | host text NOT NULL, 70 | port integer NOT NULL, 71 | last_crawled_at timestamp without time zone NOT NULL, 72 | have_policy boolean NOT NULL, 73 | PRIMARY KEY (host, port) 74 | ); 75 | )"); 76 | 77 | co_await db->execSqlCoro(R"( 78 | CREATE TABLE IF NOT EXISTS public.perma_redirects ( 79 | from_url text NOT NULL, 80 | to_url text NOT NULL, 81 | PRIMARY KEY (from_url) 82 | ); 83 | )"); 84 | app().quit(); 85 | } 86 | 
87 | Task<> purgePage(std::string url) 88 | { 89 | auto db = app().getDbClient(); 90 | auto page = co_await db->execSqlCoro("DELETE FROM pages WHERE url like $1;", url); 91 | co_await db->execSqlCoro("DELETE FROM links WHERE url like $1;", url); 92 | co_await db->execSqlCoro("DELETE FROM links WHERE to_url like $1;", url); 93 | std::cout << "Deleted " << page.affectedRows() << " pages from index" << std::endl; 94 | app().quit(); 95 | } 96 | 97 | Task<> indexStatus() 98 | { 99 | auto db = app().getDbClient(); 100 | auto domain_pages = co_await db->execSqlCoro("SELECT COUNT(DISTINCT LOWER(domain_name)) AS domain_count, COUNT(*) AS count " 101 | "FROM pages WHERE content_body IS NOT NULL"); 102 | auto pages_need_update = co_await db->execSqlCoro("SELECT COUNT(*) AS count FROM pages WHERE " 103 | "last_crawled_at < CURRENT_TIMESTAMP - INTERVAL '3' DAY OR last_crawled_at IS NULL"); 104 | std::cout << domain_pages[0]["domain_count"].as() << " domains in index\n"; 105 | std::cout << domain_pages[0]["count"].as() << " pages in index\n"; 106 | std::cout << pages_need_update[0]["count"].as() << " pages need update\n"; 107 | app().quit(); 108 | } 109 | 110 | int main(int argc, char** argv) 111 | { 112 | std::string config_file = "/etc/tlgs/config.json"; 113 | CLI::App cli{"TLGS Utiltity"}; 114 | 115 | CLI::App& populate_schema = *cli.add_subcommand("populate_schema", "Populate/update database schema"); 116 | 117 | CLI::App& purge = *cli.add_subcommand("purge", "Remove page from database"); 118 | std::string url; 119 | purge.add_option("purge_url", url, "URL to purge (SQL wildcards allowed)"); 120 | 121 | CLI::App& index_status = *cli.add_subcommand("indexstatus", "Show status of the index"); 122 | 123 | cli.add_option("config_file", config_file, "Path to TLGS config file"); 124 | CLI11_PARSE(cli, argc, argv); 125 | 126 | app().loadConfigFile(config_file); 127 | 128 | if(populate_schema) { 129 | app().getLoop()->queueInLoop(async_func(createDb)); 130 | } 131 | else if(purge) { 
#include <drogon/drogon_test.h>
#include <tlgsutils/utils.hpp>

// Exercises tlgs::isAsciiArt(): the repeated-character heuristic and the
// known art glyph/pattern lists.
DROGON_TEST(AsciiArtDetectorTest)
{
    // Box drawing made of repeated '-' characters.
    std::string s = R"(
 .-----------------.
| .---------------. |
| | _________ | |
| | |___ ___| | |
| | | | | |
| | | | | |
| | _| |_ | |
| | |_____| | |
| | | |
| '---------------' |
 '-----------------')";

    CHECK(tlgs::isAsciiArt(s) == true);

    // Large dot-art planet; triggers on runs of '#' and '.'.
    s = R"|(
 . . . . . . . . + .
 . : . .. :. .___---------___.
 . . . . :.:. _".^ .^ ^. '.. :"-_. .
 : . . .:../: . .^ :.:\.
 . . :: +. :.:/: . . . . . .:\
 . . _ :::/: . ^ . . .:\
 . . . - : :.:./. . .:\
 . . . ::. ::\( . :)|
 . : . : .:.|. ###### .#######::/
 . :- : .: ::|.####### ..########:|
 . .. . .. :\ ######## :######## :/
 .+ :: : -.:\ ######## . ########.:/
 .+ . . . . :.:\. ####### #######..:/
 :: . . . . ::.:..:.\ . . ..:/
 . . .. : -::::.\. | | . .:/
 . : . . .-:.":.::.\ ..:/
 -. . . . .: .:::.:.\. .:/
 . : : ....::_:..:\ ___. :/
 . . .:. .. . .: :.:.:\ :/
 + . . : . ::. :.:. .:.|\ .:/|
 . + . . ...:: ..| --.:|
 . . . . . . ... :..:.."( ..)"
 . . : . .: ::/ . .::\
 )|";
    CHECK(tlgs::isAsciiArt(s) == true);

    // Smaller banner; still has >= 4 repeated '-' characters.
    s = R"(
 __
 .-----.|__|.-----.-----.
| _ || || | _ |
| __||__||__|__|___ |
|__| station |_____|
 )";
    CHECK(tlgs::isAsciiArt(s) == true);

    // Unicode block-art; caught by the glyph list (e.g. '█').
    s = R"(
 ███████████████████▎██████▎▅████████████████▅
 ███████████████████▎██████▎███▎▎ █████████ ▎
 █████▎━━━━━━███████▎██████▎▀████████████████▀ ▎
 █████▎▎ ███████▎██████▎▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
 █████▎▎mieum███████▎██████▎██████████████████ ▎
 ███████████████████▎██████▎▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
 ███████████████████▎██████▎██████████████████ ▎
 ███████████████████▎██████▎█████████▎▎ ██ ▎
 ███████████████████▎██████▎██████████████████ ▎
 ━━━━━━━━━━━━━━━━━━━ ━━━━━━ ━━━━━━━━━━━━━━━━━━
 )";
    CHECK(tlgs::isAsciiArt(s) == true);
}

// Exercises tlgs::linkCompose(): resolution of absolute and relative link
// targets (including "..", queries and fragments) against a base URL.
DROGON_TEST(LinkCompositionTest)
{
    // Absolute path replaces the base path.
    auto url = tlgs::Url("gemini://localhost/");
    std::string path = "/dir";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir");

    url = tlgs::Url("gemini://localhost/asd");
    path = "/dir";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir");

    // Relative path: resolved against the parent of a file URL...
    url = tlgs::Url("gemini://localhost/asd");
    path = "dir";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir");

    // ...or appended to a directory URL (trailing '/').
    url = tlgs::Url("gemini://localhost/asd/");
    path = "dir";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/asd/dir");

    url = tlgs::Url("gemini://localhost/asd/");
    path = "dir/";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/asd/dir/");

    // The base URL's fragment is not inherited by the composed link.
    url = tlgs::Url("gemini://localhost/asd#123456");
    path = "dir/";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir/");

    // The link's own fragment wins.
    url = tlgs::Url("gemini://localhost/asd#123456");
    path = "dir#789";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir#789");

    // ".." walks up one directory.
    url = tlgs::Url("gemini://localhost/asd/zxc");
    path = "../dir";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir");

    // The base URL's query is not inherited either.
    url = tlgs::Url("gemini://localhost/asd?123");
    path = "dir";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir");

    url = tlgs::Url("gemini://localhost/asd");
    path = "dir?123";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir?123");

    url = tlgs::Url("gemini://localhost/asd#123");
    path = "dir?789";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/dir?789");

    // IP-literal hosts work the same as names.
    url = tlgs::Url("gemini://127.0.0.1/asd#123");
    path = "dir?789";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://127.0.0.1/dir?789");

    url = tlgs::Url("gemini://localhost/test/one.gmi");
    path = "../two.gmi";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://localhost/two.gmi");

    // ".." into a sibling directory (regression case from a real capsule).
    url = tlgs::Url("gemini://dhammapada.michaelnordmeyer.com/chapter-3/33.gmi");
    path = "../chapter-2/32.gmi";
    CHECK(tlgs::linkCompose(url, path).str() == "gemini://dhammapada.michaelnordmeyer.com/chapter-2/32.gmi");
}

// Exercises tlgs::isNonUriAction(): "scheme:stuff" without "://" counts as a
// non-URI action (mailto:, javascript:, ...).
DROGON_TEST(NonUriActionTest)
{
    CHECK(tlgs::isNonUriAction("javascript:void(2)") == true);
    CHECK(tlgs::isNonUriAction("mailto:tom@example.com") == true);
    CHECK(tlgs::isNonUriAction("gemini://localhost") == false);
}

// Exercises tlgs::pgSQLRealEscape(): quote doubling and control-character
// escaping for SQL string interpolation.
DROGON_TEST(PgSQLEscape)
{
    CHECK(tlgs::pgSQLRealEscape("test") == "test");
    CHECK(tlgs::pgSQLRealEscape("test'") == "test''");
    CHECK(tlgs::pgSQLRealEscape("test''") == "test''''");
    CHECK(tlgs::pgSQLRealEscape("\n") == "\\n");
}

// Pins the tlgs::xxHash64() output format (uppercase hex of the raw hash
// bytes) against a known vector.
DROGON_TEST(XXHashTest)
{
    CHECK(tlgs::xxHash64("Hello, World!") == "C49AACF8080FE47F");
}
#include 6 | #include 7 | #include 8 | #include "search_result.hpp" 9 | 10 | using namespace drogon; 11 | 12 | 13 | namespace api 14 | { 15 | struct v1 : public HttpController 16 | { 17 | public: 18 | Task known_hosts(HttpRequestPtr req); 19 | Task known_feeds(HttpRequestPtr req); 20 | Task known_security_txt(HttpRequestPtr req); 21 | Task known_perma_redirects(HttpRequestPtr req); 22 | 23 | METHOD_LIST_BEGIN 24 | METHOD_ADD(v1::known_hosts, "/known_hosts", {Get}); 25 | METHOD_ADD(v1::known_feeds, "/known_feeds", {Get}); 26 | METHOD_ADD(v1::known_security_txt, "/known_security_txt", {Get}); 27 | METHOD_ADD(v1::known_perma_redirects, "/known_perma_redirects", {Get}); 28 | METHOD_LIST_END 29 | }; 30 | } 31 | 32 | static CacheMap> cache(app().getLoop(), 600); 33 | 34 | Task api::v1::known_hosts(HttpRequestPtr req) 35 | { 36 | std::shared_ptr hosts; 37 | if(cache.findAndFetch("hosts", hosts) == false ) { 38 | auto db = app().getDbClient(); 39 | auto known_hosts = co_await db->execSqlCoro("SELECT DISTINCT domain_name, port FROM pages"); 40 | auto hosts_vector = std::vector(); 41 | hosts_vector.reserve(known_hosts.size()); 42 | for(const auto& host : known_hosts) { 43 | auto host_name = host["domain_name"].as(); 44 | auto port = host["port"].as(); 45 | hosts_vector.push_back(tlgs::Url("gemini://"+host_name+":"+std::to_string(port)+"/").str()); 46 | } 47 | hosts = std::make_shared(hosts_vector); 48 | cache.insert("hosts", hosts, 3600*8); 49 | } 50 | 51 | co_await sleepCoro(app().getLoop(), 0.75); 52 | 53 | auto resp = HttpResponse::newHttpResponse(); 54 | resp->setBody(hosts->dump()); 55 | resp->setContentTypeCode(CT_APPLICATION_JSON); 56 | co_return resp; 57 | } 58 | 59 | Task api::v1::known_feeds(HttpRequestPtr req) 60 | { 61 | const static std::array allowed_type = {"atom", "rss", "twtxt", "gemsub"}; 62 | auto request_feed_type = req->getParameter("query"); 63 | if(request_feed_type == "") 64 | request_feed_type = "atom"; 65 | 66 | if(std::find(allowed_type.begin(), 
allowed_type.end(), request_feed_type) == allowed_type.end()) 67 | throw std::invalid_argument("Invalid feed type"); 68 | 69 | std::shared_ptr feeds; 70 | if(cache.findAndFetch(request_feed_type, feeds) == false ) { 71 | auto db = app().getDbClient(); 72 | auto feeds_in_db = co_await db->execSqlCoro("SELECT url FROM pages WHERE feed_type = $1", request_feed_type); 73 | auto feeds_vector = tlgs::map(feeds_in_db, [](const auto& feed) { return feed["url"].template as(); }); 74 | feeds = std::make_shared(std::move(feeds_vector)); 75 | cache.insert(request_feed_type, feeds, 3600*8); 76 | } 77 | 78 | co_await sleepCoro(app().getLoop(), 0.75); 79 | 80 | auto resp = HttpResponse::newHttpResponse(); 81 | resp->setBody(feeds->dump()); 82 | resp->setContentTypeCode(CT_APPLICATION_JSON); 83 | co_return resp; 84 | } 85 | 86 | Task api::v1::known_security_txt(HttpRequestPtr req) 87 | { 88 | std::shared_ptr security_txt; 89 | if(cache.findAndFetch("security_txt", security_txt) == false ) { 90 | auto db = app().getDbClient(); 91 | auto known_security_txt = co_await db->execSqlCoro("SELECT url FROM pages WHERE " 92 | "content_type = 'text/plain' AND url ~ '.*://[^\\/]+/.well-known/security.txt'"); 93 | auto security_txt_vector = tlgs::map(known_security_txt, [](const auto& security_txt) { 94 | return security_txt["url"].template as(); 95 | }); 96 | security_txt_vector.reserve(known_security_txt.size()); 97 | security_txt = std::make_shared(security_txt_vector); 98 | cache.insert("security_txt", security_txt, 3600*8); 99 | } 100 | co_await sleepCoro(app().getLoop(), 0.75); 101 | auto resp = HttpResponse::newHttpResponse(); 102 | resp->setBody(security_txt->dump()); 103 | resp->setContentTypeCode(CT_APPLICATION_JSON); 104 | co_return resp; 105 | } 106 | 107 | Task api::v1::known_perma_redirects(HttpRequestPtr req) 108 | { 109 | std::shared_ptr perma_redirects; 110 | if(cache.findAndFetch("perma_redirects", perma_redirects) == false ) { 111 | auto db = app().getDbClient(); 112 | auto 
known_perma_redirects = co_await db->execSqlCoro("SELECT from_url, to_url FROM perma_redirects"); 113 | perma_redirects = std::make_shared(); 114 | for(auto& redirect : known_perma_redirects) { 115 | perma_redirects->push_back({ 116 | {"from_url", redirect["from_url"].template as()}, 117 | {"to_url", redirect["to_url"].template as()} 118 | }); 119 | } 120 | cache.insert("perma_redirects", perma_redirects, 3600*8); 121 | } 122 | co_await sleepCoro(app().getLoop(), 0.75); 123 | auto resp = HttpResponse::newHttpResponse(); 124 | resp->setBody(perma_redirects->dump()); 125 | resp->setContentTypeCode(CT_APPLICATION_JSON); 126 | co_return resp; 127 | } -------------------------------------------------------------------------------- /tlgs/server/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #ifdef __linux__ 7 | #define LLUNVEIL_USE_UNVEIL 8 | #include 9 | #endif 10 | 11 | #include "search_result.hpp" 12 | 13 | #include "CLI/App.hpp" 14 | #include "CLI/Formatter.hpp" 15 | #include "CLI/Config.hpp" 16 | 17 | using namespace drogon; 18 | using namespace dremini; 19 | 20 | 21 | int main(int argc, char** argv) 22 | { 23 | app().setLogLevel(trantor::Logger::LogLevel::kTrace); 24 | app().registerHandler("/", 25 | [](const HttpRequestPtr& req, 26 | std::function &&callback) 27 | { 28 | auto resp = HttpResponse::newHttpViewResponse("homepage"); 29 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 30 | callback(resp); 31 | }); 32 | app().registerHandler("/about", 33 | [](const HttpRequestPtr& req, 34 | std::function &&callback) 35 | { 36 | HttpViewData data; 37 | data["title"] = std::string("About TLGS"); 38 | auto resp = HttpResponse::newHttpViewResponse("about", data); 39 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 40 | callback(resp); 41 | }); 42 | app().registerHandler("/news", 43 | [](const HttpRequestPtr& req, 44 | std::function 
&&callback) 45 | { 46 | HttpViewData data; 47 | data["title"] = std::string("TLGS News and Updates"); 48 | auto resp = HttpResponse::newHttpViewResponse("news", data); 49 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 50 | callback(resp); 51 | }); 52 | app().registerHandler("/doc/search", 53 | [](const HttpRequestPtr& req, 54 | std::function &&callback) 55 | { 56 | HttpViewData data; 57 | data["title"] = std::string("TLGS Search Documentation"); 58 | auto resp = HttpResponse::newHttpViewResponse("doc_search", data); 59 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 60 | callback(resp); 61 | }); 62 | app().registerHandler("/doc/index", 63 | [](const HttpRequestPtr& req, 64 | std::function &&callback) 65 | { 66 | HttpViewData data; 67 | data["title"] = std::string("TLGS Index Documentation"); 68 | auto resp = HttpResponse::newHttpViewResponse("doc_indexing", data); 69 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 70 | callback(resp); 71 | }); 72 | app().registerHandler("/doc/api", 73 | [](const HttpRequestPtr& req, 74 | std::function &&callback) 75 | { 76 | HttpViewData data; 77 | data["title"] = std::string("TLGS API Documentation"); 78 | auto resp = HttpResponse::newHttpViewResponse("doc_api", data); 79 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 80 | callback(resp); 81 | }); 82 | app().registerHandler("/doc/backlinks", 83 | [](const HttpRequestPtr& req, 84 | std::function &&callback) 85 | { 86 | HttpViewData data; 87 | data["title"] = std::string("TLGS Backlinks Documentation"); 88 | auto resp = HttpResponse::newHttpViewResponse("doc_backlinks", data); 89 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 90 | callback(resp); 91 | }); 92 | 93 | app().getLoop()->queueInLoop([](){ 94 | #if defined(__linux__) || defined(__OpenBSD__) 95 | // Lockdown the server to only access the files in the document directory 96 | 
unveil(drogon::app().getDocumentRoot().c_str(), "r"); 97 | unveil(drogon::app().getUploadPath().c_str(), "rwc"); 98 | unveil(nullptr, nullptr); 99 | #endif 100 | }); 101 | 102 | // HACK: Replace Gemini links that needs user input with =: 103 | app().registerPostHandlingAdvice([](const HttpRequestPtr& req, const HttpResponsePtr& resp) { 104 | if(req->getHeader("protocol") == "spartan" && resp->contentTypeString() == "text/gemini") { 105 | std::stringstream ss; 106 | ss << resp->getBody(); 107 | 108 | std::string result; 109 | std::string line; 110 | while(std::getline(ss, line)) { 111 | if(line.starts_with("=>") && 112 | (line.find("/search") != std::string::npos || line.find("/backlinks") != std::string::npos 113 | || line.find("/add_seed") != std::string::npos) && line.find("/doc") == std::string::npos 114 | && line.find("?") == std::string::npos) { 115 | utils::replaceAll(line, "=>", "=:"); 116 | } 117 | result += line + "\n"; 118 | } 119 | resp->setBody(result); 120 | } 121 | }); 122 | 123 | CLI::App cli{"TLGS server"}; 124 | std::string config_file = "/etc/tlgs/server_config.json"; 125 | cli.add_option("config_file", config_file, "Path to TLGS config file"); 126 | CLI11_PARSE(cli, argc, argv); 127 | LOG_INFO << "Loading config from " << config_file; 128 | app().loadConfigFile(config_file); 129 | 130 | app().run(); 131 | } 132 | -------------------------------------------------------------------------------- /tlgs/server/controllers/tools.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "search_result.hpp" 6 | 7 | using namespace drogon; 8 | 9 | struct ToolsController : public HttpController 10 | { 11 | public: 12 | Task statistics(HttpRequestPtr req); 13 | Task known_hosts(HttpRequestPtr req); 14 | Task add_seed(HttpRequestPtr req); 15 | 16 | METHOD_LIST_BEGIN 17 | ADD_METHOD_TO(ToolsController::statistics, "/statistics", {Get}); 18 | 
ADD_METHOD_TO(ToolsController::known_hosts, "/known-hosts", {Get}); 19 | ADD_METHOD_TO(ToolsController::add_seed, "/add_seed", {Get}); 20 | METHOD_LIST_END 21 | }; 22 | 23 | Task ToolsController::statistics(HttpRequestPtr req) 24 | { 25 | static CacheMap> cache(app().getLoop(), 600); 26 | std::shared_ptr server_stat; 27 | if(cache.findAndFetch("server_stat", server_stat) == false) { 28 | auto db = app().getDbClient(); 29 | auto domain_pages = co_await db->execSqlCoro("SELECT COUNT(DISTINCT LOWER(domain_name)) as domain_count, COUNT(*) AS count " 30 | "FROM pages WHERE content_body IS NOT NULL"); 31 | auto content_type = co_await db->execSqlCoro("SELECT COUNT(content_type) AS COUNT, content_type FROM pages " 32 | "GROUP BY content_type ORDER BY count DESC;"); 33 | 34 | uint64_t page_count = domain_pages[0]["count"].as(); 35 | uint64_t domain_count = domain_pages[0]["domain_count"].as(); 36 | std::vector> content_type_count; 37 | content_type_count.reserve(content_type.size()); 38 | for(const auto& content : content_type) { 39 | if(content["content_type"].isNull() == true) 40 | continue; 41 | std::string type = content["content_type"].as(); 42 | if(type.empty()) 43 | type = ""; 44 | content_type_count.push_back({type, content["count"].as()}); 45 | } 46 | 47 | server_stat = std::make_shared(); 48 | server_stat->page_count = page_count; 49 | server_stat->domain_count = domain_count; 50 | server_stat->content_type_count = std::move(content_type_count); 51 | server_stat->update_time = trantor::Date::now().toCustomedFormattedString("%Y-%m-%d %H:%M:%S", false); 52 | cache.insert("server_stat", server_stat, 3600*6); 53 | } 54 | 55 | HttpViewData data; 56 | data["title"] = std::string("TLGS Statistics"); 57 | data["server_stat"] = server_stat; 58 | auto resp = HttpResponse::newHttpViewResponse("statistics", data); 59 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 60 | co_return resp; 61 | } 62 | 63 | Task ToolsController::known_hosts(HttpRequestPtr req) 
64 | { 65 | static CacheMap>> cache(app().getLoop(), 600); 66 | std::shared_ptr> hosts; 67 | if(cache.findAndFetch("hosts", hosts) == false ) { 68 | auto db = app().getDbClient(); 69 | auto known_hosts = co_await db->execSqlCoro("SELECT DISTINCT LOWER(domain_name) AS domain_name, port FROM pages"); 70 | hosts = std::make_shared>(); 71 | hosts->reserve(known_hosts.size()); 72 | for(const auto& host : known_hosts) { 73 | auto host_name = host["domain_name"].as(); 74 | auto port = host["port"].as(); 75 | auto url = tlgs::Url("gemini://"+host_name+":"+std::to_string(port)); 76 | if(url.good() == false) 77 | continue; 78 | hosts->push_back(url.str()); 79 | } 80 | cache.insert("hosts", hosts, 3600*8); 81 | } 82 | HttpViewData data; 83 | data["title"] = std::string("Hosts known to TLGS"); 84 | data["hosts"] = std::move(hosts); 85 | auto resp = HttpResponse::newHttpViewResponse("known_hosts", data); 86 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 87 | co_return resp; 88 | } 89 | 90 | Task ToolsController::add_seed(HttpRequestPtr req) 91 | { 92 | auto input = utils::urlDecode(req->getParameter("query")); 93 | 94 | if(input.empty()) 95 | { 96 | auto resp = HttpResponse::newHttpResponse(); 97 | resp->addHeader("meta", "URL to your capsule"); 98 | resp->setStatusCode((HttpStatusCode)10); 99 | co_return resp; 100 | } 101 | 102 | auto url = tlgs::Url(input); 103 | if(url.good() == false || url.protocol() != "gemini") 104 | { 105 | auto resp = HttpResponse::newHttpResponse(); 106 | resp->addHeader("meta", "Invalid URL. 
Try again"); 107 | resp->setStatusCode((HttpStatusCode)10); 108 | co_return resp; 109 | } 110 | if(req->getHeader("protocol") != "gemini") 111 | { 112 | auto resp = HttpResponse::newHttpResponse(); 113 | resp->setBody("We only accept links directly through Gemini"); 114 | resp->setContentTypeCode(CT_TEXT_PLAIN); 115 | resp->setStatusCode(k403Forbidden); 116 | co_return resp; 117 | } 118 | 119 | // HACK: Force slow down 120 | co_await drogon::sleepCoro(app().getLoop(), 0.75); 121 | 122 | auto db = app().getDbClient(); 123 | co_await db->execSqlCoro("INSERT INTO pages (url, domain_name, port, first_seen_at) " 124 | "VALUES ($1, $2, $3, CURRENT_TIMESTAMP) ON CONFLICT DO NOTHING;", 125 | url.str(), url.host(), url.port(1965)); 126 | 127 | auto resp = HttpResponse::newHttpResponse(); 128 | resp->setBody("# Adding Capsule\nAdded " + input); 129 | resp->setContentTypeCodeAndCustomString(CT_CUSTOM, "text/gemini"); 130 | co_return resp; 131 | } -------------------------------------------------------------------------------- /tlgsutils/tests/robots_txt_parser_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | DROGON_TEST(RobotTextTest) 5 | { 6 | std::string robots = 7 | "User-agent: *\n" 8 | "Disallow: /\n"; 9 | 10 | auto disallowed = tlgs::parseRobotsTxt(robots, {"*"}); 11 | REQUIRE(disallowed.size() == 1); 12 | CHECK(disallowed[0] == "/"); 13 | 14 | robots = 15 | "User-agent: gus\n" 16 | "Disallow: /\n"; 17 | disallowed = tlgs::parseRobotsTxt(robots, {"tlgs"}); 18 | REQUIRE(disallowed.size() == 0); 19 | 20 | robots = 21 | "User-agent: gus\n" 22 | "Disallow: /\n" 23 | "\n" 24 | "User-agent: tlgs\n" 25 | "Disallow: /mydir"; 26 | disallowed = tlgs::parseRobotsTxt(robots, {"tlgs"}); 27 | REQUIRE(disallowed.size() == 1); 28 | CHECK(disallowed[0] == "/mydir"); 29 | 30 | robots = 31 | "User-agent: gus\n" 32 | "User-agent: tlgs\n" 33 | "Disallow: /\n"; 34 | disallowed = tlgs::parseRobotsTxt(robots, 
{"tlgs"}); 35 | REQUIRE(disallowed.size() == 1); 36 | CHECK(disallowed[0] == "/"); 37 | disallowed = tlgs::parseRobotsTxt(robots, {"gus"}); 38 | REQUIRE(disallowed.size() == 1); 39 | CHECK(disallowed[0] == "/"); 40 | 41 | robots = 42 | "User-agent: *\n" 43 | "Disallow: /\n" 44 | "\n" 45 | "User-agent: tlgs\n" 46 | "Disallow: \n"; 47 | disallowed = tlgs::parseRobotsTxt(robots, {"tlgs", "*"}); 48 | REQUIRE(disallowed.size() == 0); 49 | disallowed = tlgs::parseRobotsTxt(robots, {"*"}); 50 | REQUIRE(disallowed.size() == 1); 51 | CHECK(disallowed[0] == "/"); 52 | 53 | robots = 54 | "User-agent: *\n" 55 | "Disallow: /\n" 56 | "\n" 57 | "User-agent: tlgs\n" 58 | "Disallow: \n"; 59 | disallowed = tlgs::parseRobotsTxt(robots, {"tlgs", "*"}); 60 | REQUIRE(disallowed.size() == 0); 61 | 62 | robots = ""; 63 | disallowed = tlgs::parseRobotsTxt(robots, {"tlgs", "*"}); 64 | REQUIRE(disallowed.size() == 0); 65 | 66 | robots = 67 | "User-agent: indexer\n" 68 | "Disallow: /test\n" 69 | "User-agent: researcher\n" 70 | "Disallow: /\n"; 71 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 72 | REQUIRE(disallowed.size() == 1); 73 | 74 | // Test support for lowercase user-agent 75 | robots = 76 | "user-agent: indexer\n" 77 | "Disallow: /test\n"; 78 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 79 | REQUIRE(disallowed.size() == 1); 80 | 81 | // Test support for lowercase disallow 82 | robots = 83 | "User-agent: indexer\n" 84 | "disallow: /test\n"; 85 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 86 | REQUIRE(disallowed.size() == 1); 87 | 88 | // Test support for weird cases 89 | robots = 90 | "User-AGEnT: indexer\n" 91 | "disalloW: /test\n"; 92 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 93 | REQUIRE(disallowed.size() == 1); 94 | 95 | // Test handling of leading whitespace in value 96 | robots = 97 | "User-agent: \tindexer\n" 98 | "Disallow: /test\n"; 99 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 100 | 
REQUIRE(disallowed.size() == 1); 101 | REQUIRE(disallowed[0] == "/test"); 102 | 103 | // Test handling of leading whitespace in key 104 | robots = 105 | " User-agent: indexer\n" 106 | " Disallow: /test\n"; 107 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 108 | REQUIRE(disallowed.size() == 1); 109 | REQUIRE(disallowed[0] == "/test"); 110 | 111 | // Test CRLF line endings 112 | robots = 113 | "User-agent: indexer\r\n" 114 | "Disallow: /test\r\n"; 115 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer", "*"}); 116 | REQUIRE(disallowed.size() == 1); 117 | REQUIRE(disallowed[0] == "/test"); 118 | 119 | // Default user-agent should be * 120 | robots = 121 | "Disallow: /foo\n"; 122 | disallowed = tlgs::parseRobotsTxt(robots, {"indexer"}); 123 | REQUIRE(disallowed.size() == 1); 124 | CHECK(disallowed[0] == "/foo"); 125 | } 126 | 127 | DROGON_TEST(BlockedPathTest) 128 | { 129 | CHECK(tlgs::isPathBlocked("/", "/") == true); 130 | CHECK(tlgs::isPathBlocked("/foo", "/") == true); 131 | CHECK(tlgs::isPathBlocked("/bar", "/foo") == false); 132 | CHECK(tlgs::isPathBlocked("/foo", "/foobar") == false); 133 | CHECK(tlgs::isPathBlocked("/foo", "/foo/") == false); 134 | CHECK(tlgs::isPathBlocked("/foo/", "/foo") == true); 135 | CHECK(tlgs::isPathBlocked("/foo/bar/", "/foo") == true); 136 | CHECK(tlgs::isPathBlocked("/foo/", "/foo/bar") == false); 137 | CHECK(tlgs::isPathBlocked("/foo.txt", "/foo") == false); 138 | CHECK(tlgs::isPathBlocked("/foo/bar.txt", "/foo") == true); 139 | CHECK(tlgs::isPathBlocked("/foo/bar.txt", "/foo/*") == true); 140 | CHECK(tlgs::isPathBlocked("/foo/bar.txt", "*.txt") == true); 141 | CHECK(tlgs::isPathBlocked("/foo/bar.txt", "*.ogg") == false); 142 | CHECK(tlgs::isPathBlocked("/foo/dir1/bar.txt", "*.txt") == true); 143 | CHECK(tlgs::isPathBlocked("/foo/dir1/bar.txt", "*.txt$") == true); 144 | CHECK(tlgs::isPathBlocked("/foo/some_dir/bar.txt", "*some_dir*") == true); 145 | CHECK(tlgs::isPathBlocked("/foo/other_dir/bar.txt", 
"*some_dir*") == false); 146 | CHECK(tlgs::isPathBlocked("/foo/other_dir/baz/bar.txt", "/foo/*/baz") == true); 147 | CHECK(tlgs::isPathBlocked("/~testuser/gci-bin/test.txt", "/~*/cgi-bin/") == true); 148 | CHECK(tlgs::isPathBlocked("/foo/123/bar/456/baz", "/foo/*/bar/*/baz") == true); 149 | CHECK(tlgs::isPathBlocked("/foo/123/bar/baz", "/foo/*/bar/*/baz") == false); 150 | CHECK(tlgs::isPathBlocked("/foo/123/bar/baz", "/foo/*/bar/*") == true); 151 | CHECK(tlgs::isPathBlocked("/foo", "/***") == true); 152 | 153 | // Check special regex characters are escaped 154 | CHECK(tlgs::isPathBlocked("/foo/(", "/foo/(") == true); 155 | CHECK(tlgs::isPathBlocked("/*/asd/*/.mp3", "/foo/asd/bar/1mp3") == false); 156 | CHECK(tlgs::isPathBlocked("/foo/\\*", "/foo/*") == true); 157 | } 158 | -------------------------------------------------------------------------------- /tlgsutils/gemini_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "gemini_parser.hpp" 2 | #include "utils.hpp" 3 | #include "url_parser.hpp" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace tlgs 10 | { 11 | GeminiDocument extractGemini(const std::string_view sv) 12 | { 13 | return extractGemini(dremini::parseGemini(sv)); 14 | } 15 | 16 | GeminiDocument extractGemini(const std::vector& nodes) 17 | { 18 | GeminiDocument doc; 19 | doc.text.reserve(1024); 20 | for(const auto& node : nodes) { 21 | doc.text += node.text + "\n"; 22 | if(node.type == "link") 23 | doc.links.push_back(node.meta); 24 | else if(node.type == "heading1" && doc.title.empty()) 25 | doc.title = node.text; 26 | } 27 | 28 | return doc; 29 | } 30 | 31 | GeminiDocument extractGeminiConcise(const std::string_view sv) 32 | { 33 | return extractGeminiConcise(dremini::parseGemini(sv)); 34 | } 35 | 36 | GeminiDocument extractGeminiConcise(const std::vector& nodes) 37 | { 38 | // TODO: Optimize the function 39 | GeminiDocument doc; 40 | bool first_content = true; 41 | for(const 
auto& node : nodes) {
        // Skip preformatted blocks that look like ASCII art so they are not
        // indexed. This may also drop some code blocks, but that shouldn't matter.
        if(node.type == "preformatted_text") {
            // A preformatted block appearing before any text content is
            // usually a title banner / ASCII art.
            if(first_content)
                continue;

            std::string meta = node.meta;
            // Lowercase the alt text for case-insensitive matching. Cast the
            // byte to unsigned char first: passing a plain (possibly negative)
            // char to ::tolower is undefined behavior for bytes >= 0x80.
            std::transform(meta.begin(), meta.end(), meta.begin(),
                [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
            // Words commonly found in the alt text of ASCII-art blocks.
            static constexpr const char* art_hints[] = {
                "ascii", "art", "banner", "logo", "title", "news", "capsule",
                "user", "image", "the", "graphics", "world"
            };
            bool looks_like_art = false;
            for(const char* hint : art_hints) {
                if(meta.find(hint) != std::string::npos) {
                    looks_like_art = true;
                    break;
                }
            }
            if(looks_like_art)
                continue;

            if(tlgs::isAsciiArt(node.text))
                continue;
        }
        // Skip decorative paragraph separators.
        // TODO: Handle unicode paragraph separators
        else if(node.type == "text" && !node.text.empty()) {
            first_content = false;

            // The entire line is made of a single repeated character (e.g. "-----").
            char first = node.text[0];
            bool is_all_same = node.text.find_first_not_of(first) == std::string::npos;
            if(is_all_same)
                continue;
            // The first character repeats at least 3 times and so does the last.
            // Ex: ----- next up ------
            char last = node.text.back();
            if(node.text.size() > 6 && node.text.substr(0, 3).find_first_not_of(first) == std::string::npos
                && node.text.substr(node.text.size() - 3).find_first_not_of(last) == std::string::npos
                && first != ' ' && last != ' ' && first != '\t' && last != '\t')
                continue;

            // Avoid indexing pasted `tree` output that is not in a preformatted block.
if(node.text.find("│") < 3) 88 | continue; 89 | } 90 | doc.text += node.text + "\n"; 91 | if(node.type == "link") { 92 | doc.links.push_back(node.meta); 93 | } 94 | else if(node.type == "heading1" && doc.title.empty()) 95 | doc.title = node.text; 96 | } 97 | return doc; 98 | } 99 | 100 | bool isGemsub(const std::vector& nodes) 101 | { 102 | size_t cont_dated_entries_counter = 0; 103 | size_t max_cont_dated_entries = 0; 104 | const std::regex date_re(R"([0-9]{4}-[0-9]{1,2}-[0-9]{1,2}.*)"); 105 | for(const auto& node : nodes) { 106 | std::smatch sm; 107 | if(node.type == "link") { 108 | if(std::regex_match(node.text, sm, date_re)) 109 | cont_dated_entries_counter++; 110 | else 111 | cont_dated_entries_counter = 0; 112 | 113 | max_cont_dated_entries = std::max(max_cont_dated_entries, cont_dated_entries_counter); 114 | } 115 | } 116 | return max_cont_dated_entries >= 3; 117 | } 118 | 119 | bool isGemsub(const std::vector& nodes, const tlgs::Url& feed_url, const std::string_view protocol) 120 | { 121 | size_t cont_dated_entries_counter = 0; 122 | size_t max_cont_dated_entries = 0; 123 | const std::regex date_re(R"([0-9]{4}-[0-9]{1,2}-[0-9]{1,2}.*)"); 124 | for(const auto& node : nodes) { 125 | if(node.type == "link") { 126 | std::smatch sm; 127 | if(std::regex_match(node.text, sm, date_re) == false || node.meta.empty()) { 128 | cont_dated_entries_counter = 0; 129 | continue; 130 | } 131 | 132 | tlgs::Url link = tlgs::Url(node.meta); 133 | std::string link_protocol; 134 | std::string link_host; 135 | if(link.good()) { 136 | // Empty protocol means using the same protocol as the feed 137 | link_protocol = link.protocol(); 138 | if(link_protocol.empty()) 139 | link_protocol = protocol; 140 | // empty host means using the same host as the feed (shouldn't happen??) 
141 | link_host = link.host(); 142 | if(link_host.empty()) 143 | link_host = feed_url.host(); 144 | } 145 | else { 146 | // else it's local link 147 | link_protocol = protocol; 148 | link_host = feed_url.host(); 149 | } 150 | 151 | if((protocol.empty() || link_protocol == protocol) && link_host == feed_url.host()) 152 | cont_dated_entries_counter++; 153 | else 154 | cont_dated_entries_counter = 0; 155 | 156 | max_cont_dated_entries = std::max(max_cont_dated_entries, cont_dated_entries_counter); 157 | } 158 | } 159 | return max_cont_dated_entries >= 3; 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /tlgsutils/url_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "url_parser.hpp" 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace tlgs; 7 | 8 | Url::Url(const std::string& str, bool normalize_url) 9 | { 10 | if(str.empty()) { 11 | good_ = false; 12 | return; 13 | } 14 | 15 | good_ = true; 16 | std::string_view sv = str; 17 | 18 | // Find protocol 19 | auto idx = sv.find("//"); 20 | if(idx == std::string_view::npos) { 21 | good_ = false; 22 | return; 23 | } 24 | else if(idx == 0) { 25 | // URLs with no protocol. 
example: //example.com 26 | // this indicates the URL points to the same protocol as the current one 27 | 28 | protocol_ = ""; 29 | } 30 | else { 31 | if(sv[idx-1] != ':') { 32 | good_ = false; 33 | return; 34 | } 35 | protocol_ = sv.substr(0, idx-1); 36 | if(protocol_.empty()) { 37 | good_ = false; 38 | return; 39 | } 40 | for(auto ch : protocol_) { 41 | if(isalnum(ch) == false) { 42 | good_ = false; 43 | return; 44 | } 45 | } 46 | default_port_ = protocolDefaultPort(protocol_); 47 | } 48 | 49 | // Find host 50 | sv = sv.substr(idx+2); 51 | idx = sv.find_first_of(":/"); 52 | host_ = sv.substr(0, idx); 53 | if(host_.empty() || host_[0] == '.') { 54 | good_ = false; 55 | return; 56 | } 57 | if(idx == std::string::npos) { 58 | path_ = "/"; 59 | if(normalize_url) 60 | normalize(); 61 | return; 62 | } 63 | // Find port 64 | if(sv[idx] == ':') { 65 | sv = sv.substr(idx+1); 66 | idx = sv.find("/"); 67 | auto port_sv = sv.substr(0, idx); 68 | if(port_sv.empty()) { 69 | good_ = false; 70 | return; 71 | } 72 | try { 73 | port_ = std::stoi(std::string(port_sv)); 74 | if(port_ <= 0 || port_ > 65535) { 75 | good_ = false; 76 | return; 77 | } 78 | } 79 | catch(...) { 80 | good_ = false; 81 | return; 82 | } 83 | if(idx != std::string_view::npos) 84 | sv = sv.substr(idx+1); 85 | else 86 | sv = std::string_view(); 87 | } 88 | else 89 | sv = sv.substr(idx+1); 90 | // Find path 91 | idx = sv.find_first_of("?#"); 92 | path_ = "/"+std::string(sv.substr(0, idx)); 93 | if(normalize_url) 94 | normalize(); 95 | if(idx == std::string_view::npos) 96 | return; 97 | // Find param 98 | if(sv[idx] == '?') { 99 | sv = sv.substr(idx+1); 100 | idx = sv.find('#'); 101 | param_ = sv.substr(0, idx); 102 | } 103 | 104 | // Find fragment 105 | if(idx == std::string_view::npos) 106 | return; 107 | sv = sv.substr(idx+1); 108 | fragment_ = sv; 109 | } 110 | 111 | std::string Url::str() const 112 | { 113 | if(!cache_.empty()) 114 | return cache_; 115 | 116 | std::string res = (protocol_.empty() ? 
std::string("//") : protocol_+"://")+host_; 117 | if(port_ != 0 && default_port_ != port_) 118 | res += ":"+std::to_string(port_); 119 | res += path_; 120 | if(!param_.empty()) 121 | res += "?"+param_; 122 | if(!fragment_.empty()) 123 | res += "#"+fragment_; 124 | cache_ = res; 125 | return res; 126 | } 127 | 128 | std::string Url::hostWithPort(unsigned short default_port) const 129 | { 130 | std::string res = host_; 131 | if(port_ == 0 && default_port != 0) 132 | res += ":" + std::to_string(default_port); 133 | else 134 | res += ":" + std::to_string(port_); 135 | return res; 136 | } 137 | 138 | Url& Url::withHost(const std::string& new_host) 139 | { 140 | cache_.clear(); 141 | host_ = new_host; 142 | return *this; 143 | } 144 | 145 | Url& Url::withPath(const std::string& new_path, bool normalize_path) 146 | { 147 | cache_.clear(); 148 | if(new_path.empty() || new_path.front() != '/') 149 | path_ = "/" + new_path; 150 | else 151 | path_ = new_path; 152 | 153 | if(normalize_path) 154 | path_ = std::filesystem::path(path_).lexically_normal().generic_string(); 155 | return *this; 156 | } 157 | 158 | Url& Url::withParam(const std::string& new_param) 159 | { 160 | cache_.clear(); 161 | param_ = new_param; 162 | return *this; 163 | } 164 | 165 | Url& Url::withProtocol(const std::string& new_protocol) 166 | { 167 | cache_.clear(); 168 | protocol_ = new_protocol; 169 | return *this; 170 | } 171 | 172 | Url& Url::withPort(unsigned short new_port) 173 | { 174 | cache_.clear(); 175 | port_ = new_port; 176 | return *this; 177 | } 178 | 179 | Url& Url::withPort() 180 | { 181 | cache_.clear(); 182 | port_ = 0; 183 | return *this; 184 | } 185 | 186 | Url& Url::withDefaultPort(unsigned short n) 187 | { 188 | cache_.clear(); 189 | default_port_ = n; 190 | return *this; 191 | } 192 | 193 | Url& Url::withFragment(const std::string& new_fragment) 194 | { 195 | cache_.clear(); 196 | fragment_ = new_fragment; 197 | return *this; 198 | } 199 | 200 | Url& Url::normalize() 201 | { 202 | 
cache_.clear(); 203 | std::transform(protocol_.begin(), protocol_.end(), protocol_.begin(), ::tolower); 204 | path_ = std::filesystem::path(path_).lexically_normal().generic_string(); 205 | std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower); 206 | return *this; 207 | } 208 | 209 | int Url::port(int default_port) const 210 | { 211 | if(port_ != 0) 212 | return port_; 213 | else if(default_port != 0) 214 | return default_port; 215 | else 216 | return default_port_; 217 | } 218 | 219 | const std::string& Url::protocol() const 220 | { 221 | return protocol_; 222 | } 223 | 224 | const std::string& Url::host() const 225 | { 226 | return host_; 227 | } 228 | 229 | const std::string& Url::path() const 230 | { 231 | return path_; 232 | } 233 | 234 | const std::string& Url::param() const 235 | { 236 | return param_; 237 | } 238 | 239 | const std::string& Url::fragment() const 240 | { 241 | return fragment_; 242 | } 243 | 244 | 245 | int Url::protocolDefaultPort(const std::string_view& proto) 246 | { 247 | if(proto == "http") 248 | return 80; 249 | else if(proto == "https") 250 | return 443; 251 | else if(proto == "gemini") 252 | return 1965; 253 | else if(proto == "gopher") 254 | return 70; 255 | else if(proto == "ftp") 256 | return 21; 257 | else 258 | return 0; 259 | } 260 | -------------------------------------------------------------------------------- /tlgsutils/tests/gemini_parser_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | using namespace dremini; 5 | 6 | DROGON_TEST(GeminiParserBasics) 7 | { 8 | auto nodes = dremini::parseGemini("# Heading"); 9 | REQUIRE(nodes.size() == 1); 10 | CHECK(nodes[0].text == "Heading"); 11 | CHECK(nodes[0].type == "heading1"); 12 | CHECK(nodes[0].orig_text == "# Heading"); 13 | CHECK(nodes[0].meta == ""); 14 | 15 | nodes = dremini::parseGemini("## Heading2"); 16 | REQUIRE(nodes.size() == 1); 17 | CHECK(nodes[0].text == "Heading2"); 18 
// Smoke tests for the Gemtext parser: one test per line type
// (headings, links, plain text, preformatted blocks, blank lines).
DROGON_TEST(GeminiParserBasics)
{
    // Level-1 heading: "#" is stripped from .text, kept in .orig_text.
    auto nodes = dremini::parseGemini("# Heading");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Heading");
    CHECK(nodes[0].type == "heading1");
    CHECK(nodes[0].orig_text == "# Heading");
    CHECK(nodes[0].meta == "");

    // Level-2 heading.
    nodes = dremini::parseGemini("## Heading2");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Heading2");
    CHECK(nodes[0].type == "heading2");
    CHECK(nodes[0].orig_text == "## Heading2");
    CHECK(nodes[0].meta == "");

    // Level-3 heading.
    nodes = dremini::parseGemini("### Heading3");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Heading3");
    CHECK(nodes[0].type == "heading3");
    CHECK(nodes[0].orig_text == "### Heading3");
    CHECK(nodes[0].meta == "");

    // Link line: URL goes into .meta, label into .text.
    nodes = dremini::parseGemini("=> gemini://example.com Example Link");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Example Link");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    // Link without a label: .text is empty.
    nodes = dremini::parseGemini("=> gemini://example.com");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    // The space after "=>" is optional per the Gemini spec.
    nodes = dremini::parseGemini("=>gemini://example.com");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    // Plain text line.
    nodes = dremini::parseGemini("Hello World");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Hello World");
    CHECK(nodes[0].type == "text");
    CHECK(nodes[0].meta == "");

    // Preformatted block: fence lines are consumed, content (with
    // trailing newline) is kept verbatim.
    nodes = dremini::parseGemini("```\nHello World\n```");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Hello World\n");
    CHECK(nodes[0].type == "preformatted_text");
    CHECK(nodes[0].meta == "");

    // Blank lines each become an empty text node.
    nodes = dremini::parseGemini("\n\n");
    REQUIRE(nodes.size() == 2);
    CHECK(nodes[0].type == "text");
    CHECK(nodes[1].type == "text");
    CHECK(nodes[0].text == "");
    CHECK(nodes[1].text == "");
}

// Exhaustive whitespace handling around "=>" link lines: spaces, tabs,
// runs of separators, and trailing whitespace in the label.
DROGON_TEST(GeminiParserLink)
{
    auto nodes = dremini::parseGemini("=>gemini://example.com Example Link");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Example Link");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    nodes = dremini::parseGemini("=> gemini://example.com Example Link");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Example Link");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    nodes = dremini::parseGemini("=> gemini://example.com Example Link");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Example Link");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    nodes = dremini::parseGemini("=>gemini://example.com");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    // Tab is also a valid separator after "=>".
    nodes = dremini::parseGemini("=>\tgemini://example.com");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    nodes = dremini::parseGemini("=>\tgemini://example.com\t\tHello");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Hello");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");

    // Trailing whitespace in the label is trimmed.
    nodes = dremini::parseGemini("=>\tgemini://example.com\t\tHello ");
    REQUIRE(nodes.size() == 1);
    CHECK(nodes[0].text == "Hello");
    CHECK(nodes[0].type == "link");
    CHECK(nodes[0].meta == "gemini://example.com");
}

// A small multi-line document parses into one node per line.
DROGON_TEST(GeminiParserArticle)
{
    auto art1 =
        "# This is a sample article\n"
        "Hello world\n"
        "* list\n"
        "\n";
    auto nodes = dremini::parseGemini(art1);
    REQUIRE(nodes.size() == 4);

}

// Tests for gemsub (Gemini subscription) feed detection on parsed pages.
DROGON_TEST(IsGemsub)
{
    // gemlog index
    auto nodes = dremini::parseGemini(R"(
# This is a sample article
Hello world

=> /index1.gmi 2022-01-01 First article
=> /index2.gmi 2022-01-02 Second article
=> /index2.gmi 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes) == true);

    // Random text
    nodes = dremini::parseGemini(R"(
# First First
Hello world
)");
    CHECK(tlgs::isGemsub(nodes) == false);

    // index, but not gemlog index
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> / home
=> /food food
=> /cars cars
)");
    CHECK(tlgs::isGemsub(nodes) == false);

    // index, but not to gemini. Still considered gemsub
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> https://example.com/test1.html 2022-01-01 First article
=> https://example.com/test2.html 2022-01-02 Second article
=> https://example.com/test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes) == true);

    // index, relative links
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> /test1.html 2022-01-01 First article
=> /test2.html 2022-01-02 Second article
=> /test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes) == true);


    // index, but now we care about whether the file is hosted on the same host
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> https://example.com/test1.html 2022-01-01 First article
=> https://example.com/test2.html 2022-01-02 Second article
=> https://example.com/test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes, tlgs::Url("https://example.org")) == false);

    // require gemini feed
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> https://example.com/test1.html 2022-01-01 First article
=> https://example.com/test2.html 2022-01-02 Second article
=> https://example.com/test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes, tlgs::Url("https://example.org"), "gemini") == false);

    // require gemini feed
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> gemini://example.com/test1.html 2022-01-01 First article
=> gemini://example.com/test2.html 2022-01-02 Second article
=> gemini://example.com/test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes, tlgs::Url("https://example.com"), "gemini") == true);

    // index, links on same host
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> /test1.html 2022-01-01 First article
=> /test2.html 2022-01-02 Second article
=> /test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes, tlgs::Url("https://example.com"), "gemini") == true);

    // index, relative links
    nodes = dremini::parseGemini(R"(
# First First
Hello world

=> test1.html 2022-01-01 First article
=> test2.html 2022-01-02 Second article
=> test3.html 2022-01-03 Third article
)");
    CHECK(tlgs::isGemsub(nodes, tlgs::Url("https://example.com"), "gemini") == true);
}
This time two minor changes to the crawler:
I've been busy upgrading the TLS capability of TLGS. It's a massive endeavor that involved rewriting a hugh chunk of the networking stack. Now it's stable enough that I am confident to open it to the public. However, it's still experimental. So please report any issues you encounter. Something you might notice (and more to come): 41 | 42 | * Supports TLS session resumption, faster page response and faster crawls 43 | * Behaves well when unexpected TLS alerts are received 44 | * Disabled weak ciphersuites 45 | 46 | ### 2023-01-08 Reverting CJK support 47 | 48 | I'm reverting the CJK support for now. It's causing SQL timeouts and no clear indication why. 49 | 50 | ### 2022-11-09 Gemsub feed aggregation 51 | 52 | In addition to supporting twtxt, Atom and RSS feeds. TLGS now scans for Gemsub feeds while crawling. I'm hoping to build on this capability for a large scale feed aggregation service. 53 | 54 | ### 2022-10-28 CJK support 55 | 56 | Switched FTS indexing algorithm from PostgreSQL's internal one to pgroonga. This enables CJK support without too much performance drag. I may, might switch to ElasticSearch or Manticore in the future. But that's for now. 57 | 58 | ### 2022-10-9 Crawler fixed 59 | 60 | Happy to report that the FD leak is solved! Also optimized how DNS clients are created for lower CPU use by crawler. 61 | 62 | ### 2022-9-25 63 | 64 | Prevent edge case where the crawler tries to crawl localhost. Updated blacklist. I seems to be closer to resolving a file descriptor issue that leads to crawler hang. 65 | 66 | ### 2022-7-24 67 | 68 | Sorry for the few hours of downtime. Upgrading PostgreSQL gone horribly wrong. I had to ditch the entire database and start a completely clean one. Btw, search has been improved some what. Fixed a bug in title matching. For example, directly searching "Cosmos" will bring up the cosmos aggregator. 69 | 70 | ### 2022-7-19 71 | 72 | More updates are comming! Added emoji to links. 
And a new WIP API to query known security.txt files. Next I'll be working on improving search result and responsiveness. 73 | 74 | ### 2022-7-16 75 | 76 | Added a new API `/api/v1/known_feeds` to allow querying feeds known to TLGS. See the API documentation for more information. 77 | 78 | ### 2022-6-6 79 | 80 | Sorry for the ~15 hours of outage yesterday. An exception escaped and caused the entire server to go down. Measures are taken to ensure that won't happen again. 81 | 82 | ### 2022-6-5 83 | 84 | The crawler hang is (very largely) fixed. This a stupid error I made causing circular references. The automated re-indexing will be reactivated in the near future. Up to this point I had to do it manually after discovering the hang. 85 | 86 | ### 2022-6-2 87 | 88 | Added LEO ring endpoints and search engine metadata pages to the crawler blacklist. This should improve search speed and quality. 89 | 90 | ### 2022-5-9 91 | 92 | Updated robots.txt parsing to be more robust. Now there shall be less issues caused by non-standard robots.txt files. Namely added support for case insensitive keys and leading whitespace handling. 93 | 94 | ### 2022-4-29 95 | 96 | The server has been upgraded to Ubuntu 22.04 with Linux 4.15! With that, we are able to use the landlock kernel feature to prevent attackers from executing any commands/access arbitrary files through any server exploits. Drogon is also updated to 1.7.5 97 | 98 | ### 2022-4-22 99 | 100 | Just a headsup. The server will be upgraded to Ubuntu 22.04 sometime next week or the week after. Expect a few hours of downtime. Things should be back to normal after that. Also I'm planning to migrate to OpenBSD for better security once they adopted clang-14. 101 | 102 | ### 2022-4-16 103 | 104 | Updated dremini and security measures. Improving speed of the HTTP version of this capsule. And as you might have noticed. Search results are now highlighted with [] so it's easier to see what you're looking for. 
105 | 106 | ### 2022-2-6 107 | 108 | At last the crawler can finish without performance degradation and attendence. From now on crawling will happen automatically every Wednesday and Sunday 00:00 UTC. 109 | 110 | Also security update. Due to some weird firewall logs. Which I suspect are simply bots poking around, but anyways. TLGS server now runs on GrapheneOS's hardened_malloc and with much less privilege. I think my code is secure. But better safe than sorry. (Man, I'd love to have unveil and pledge on Linux). 111 | 112 | ### 2022-2-2 113 | 114 | I switched the ranking algorithm from HITS to SALSA after some optimization and bug fixes. It is faster than HITS and surfaces more relevant pages. Also very pround that gemini.circumlunar.space is finally the first link in the search result when I search for "gemini". Followed by other very popular capsules like medusae.space! 115 | 116 | ### 2022-1-9 117 | 118 | Happy new year! The search engine can decuplicate search results. Hopefully it improes the user experience. Let me know if it hilds important results for you. It shouldn't. But who knows. maybe a bug is hiding. 119 | 120 | ### 2021-12-27 121 | 122 | The crawler is updated to handle common wildcard patterns and server stablity is improved. Hopefully redusing trobles people have with TLGS. 123 | 124 | ### 2021-12-4 125 | 126 | TLGS has a common-web interface! You can search content on Gemini from common web. It does not proxy content from outside of TLGS's site. You still need a Gemini browser to browse searched result. 127 | => https://tlgs.one/ 128 | 129 | ### 2021-12-1 130 | 131 | Running well. However it cannot finish crawling without attendence. Likely some bug in the crawler. If you see random downtime of TLGS recently, that's likely me fixing stuff. 132 | 133 | ### 2021-11-17 134 | 135 | Going public! Hope this goes well 136 | 137 | ### 2021-11-13 138 | 139 | Tried a full-crawl of the Geminispace. Currently only crawls when I want to. 
If this goes well will move to a VPS and open it for public use. 140 | 141 | ### 2021-11-7 142 | 143 | Test deployment in my homelab. Running with 40K pages in index. Response time is acceptable with a bloody slow CPU. 144 | -------------------------------------------------------------------------------- /tlgs/crawler/blacklist.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | bool inBlacklist(const std::string& url_str) 10 | { 11 | tlgs::Url url(url_str); 12 | // TODO: Make this live on SQL 13 | static const std::set blacklist_domains = { 14 | // example sites 15 | "example.com", 16 | "example.org", 17 | "example.net", 18 | "example.io", 19 | "example.us", 20 | "example.eu", 21 | "example.gov", 22 | "example.space", 23 | // localhosts, 127.0.0.x is handled separately 24 | "localhost", 25 | "[::1]", 26 | // Apprantly this is a valid domain on Ubuntu 27 | "localhost.localdomain", 28 | // Known sites to be down and won't be back 29 | "gus.guru", 30 | "ftrv.se", 31 | "git.thebackupbox.net", 32 | "mikelynch.org", 33 | "campaignwiki.org" // Likely configration error 34 | }; 35 | 36 | static const std::vector blacklist_urls = { 37 | // Mostly imported from geminispace.info. 
38 | 39 | // vvvvvvvvvvvvv GUS blacklist vvvvvvvvvvvvv 40 | "gemini://www.youtube.com/", 41 | 42 | // all combinations of a tictactoe board 43 | "gemini://tictactoe.lanterne.chilliet.eu", 44 | 45 | // serving big files and slooow capsule -> takes to long to crawl 46 | "gemini://kamalatta.ddnss.de/", 47 | "gemini://tweek.zyxxyz.eu/valentina/", 48 | 49 | // ASCII art with emulated modem speed 50 | "gemini://ansi.hrtk.in/", 51 | "gemini://matrix.kiwifarms.net", 52 | 53 | // ZachDeCooks songs 54 | "gemini://songs.zachdecook.com/song.gmi.php/", 55 | "gemini://songs.zachdecook.com/chord.svg/", 56 | "gemini://gemini.zachdecook.com/cgi-bin/ccel.sh", 57 | 58 | // breaks crawl due to recursion overflow 59 | "gemini://cadence.moe/chapo/", 60 | 61 | "gemini://nixo.xyz/reply/", 62 | "gemini://nixo.xyz/notify", 63 | "gemini://gemini.thebackupbox.net/queryresponse", 64 | "gemini://gem.garichankar.com/share_audio", 65 | 66 | // Mastodon mirror 67 | "gemini://vps01.rdelaage.ovh/", 68 | "gemini://mastogem.picasoft.net", 69 | 70 | // various failing resources on runjimmyrunrunyoufuckerrun.com 71 | "gemini://runjimmyrunrunyoufuckerrun.com/fonts/", 72 | "gemini://runjimmyrunrunyoufuckerrun.com/tmp/", 73 | 74 | // Search providers 75 | "gemini://houston.coder.town/search?", 76 | "gemini://houston.coder.town/search/", 77 | "gemini://marginalia.nu/search", 78 | "gemini://kennedy.gemi.dev/cached?", 79 | 80 | // Geddit 81 | "gemini://geddit.pitr.ca/post?", 82 | "gemini://geddit.pitr.ca/c/", 83 | "gemini://geddit.glv.one/post?", 84 | "gemini://geddit.glv.one/c/", 85 | 86 | // Marmaladefoo calculator 87 | "gemini://gemini.marmaladefoo.com/cgi-bin/calc.cgi?", 88 | "gemini://gemini.circumlunar.space/users/fgaz/calculator/", 89 | 90 | // Individual weather pages 91 | "gemini://acidic.website/cgi-bin/weather.tcl?", 92 | "gemini://caolan.uk/weather/", 93 | 94 | // Alex Schroeder's problematic stuff 95 | "gemini://alexschroeder.ch/image_external", 96 | "gemini://alexschroeder.ch/html/", 97 | 
"gemini://alexschroeder.ch/diff/", 98 | "gemini://alexschroeder.ch/history/", 99 | "gemini://alexschroeder.ch/http", 100 | "gemini://alexschroeder.ch/https", 101 | "gemini://alexschroeder.ch/tag/", 102 | "gemini://alexschroeder.ch/raw/", 103 | "gemini://alexschroeder.ch/map/", 104 | "gemini://alexschroeder.ch/do/comment", 105 | "gemini://alexschroeder.ch/do/rc", 106 | "gemini://alexschroeder.ch/do/rss", 107 | "gemini://alexschroeder.ch/do/new", 108 | "gemini://alexschroeder.ch/do/more", 109 | "gemini://alexschroeder.ch/do/tags", 110 | "gemini://alexschroeder.ch/do/match", 111 | "gemini://alexschroeder.ch/do/search", 112 | "gemini://alexschroeder.ch/do/gallery/", 113 | 114 | // mozz mailing list linkscraper 115 | "gemini://mozz.us/files/gemini-links.gmi", 116 | "gemini://gem.benscraft.info/mailing-list", 117 | "gemini://rawtext.club/~sloum/geminilist", 118 | // gemini.techrights.org 119 | "gemini://gemini.techrights.org/", 120 | 121 | // youtube mirror 122 | "gemini://pon.ix.tc/cgi-bin/youtube.cgi?", 123 | "gemini://pon.ix.tc/youtube/", 124 | 125 | // news mirrors - not our business 126 | // TLGS can handle some news. Let's keep them for now 127 | // "gemini://guardian.shit.cx/", // NOTE: at least try to index one new site! 
128 | "gemini://taz.de/", 129 | "gemini://simplynews.metalune.xyz", 130 | "gemini://illegaldrugs.net/cgi-bin/news.php?", 131 | "gemini://illegaldrugs.net/cgi-bin/reader", 132 | "gemini://rawtext.club/~sloum/geminews", 133 | "gemini://gemini.cabestan.tk/hn", 134 | "gemini://hn.filiuspatris.net/", 135 | "gemini://schmittstefan.de/de/nachrichten/", 136 | "gemini://gmi.noulin.net/mobile", 137 | "gemini://jpfox.fr/rss/", 138 | "gemini://illegaldrugs.net/cgi-bin/news.php/", 139 | "gemini://dw.schettler.net/", 140 | "gemini://dioskouroi.xyz/top", 141 | "gemini://drewdevault.com/cgi-bin/hn.py", 142 | "gemini://tobykurien.com/maverick/", 143 | "gemini://gemini.knusbaum.com", 144 | 145 | // wikipedia proxy 146 | "gemini://wp.pitr.ca/", 147 | "gemini://wp.glv.one/", 148 | "gemini://wikipedia.geminet.org/", 149 | "gemini://wikipedia.geminet.org:1966", 150 | "gemini://vault.transjovian.org/", 151 | 152 | // client torture test 153 | "gemini://egsam.pitr.ca/", 154 | "gemini://egsam.glv.one/", 155 | "gemini://gemini.conman.org/test", 156 | 157 | // mozz's chat 158 | "gemini://chat.mozz.us/stream", 159 | "gemini://chat.mozz.us/submit", 160 | 161 | // gopher proxy 162 | "gemini://80h.dev/agena/", 163 | 164 | // astrobotany 165 | "gemini://astrobotany.mozz.us/app/", 166 | "gemini://carboncopy.xyz/cgi-bin/apache.gex/", 167 | 168 | // susa.net 169 | "gemini://gemini.susa.net/cgi-bin/search?", 170 | "gemini://gemini.susa.net/cgi-bin/twitter?", 171 | "gemini://gemini.susa.net/cgi-bin/vim-search?", 172 | "gemini://gemini.susa.net/cgi-bin/links_stu.lua?", 173 | 174 | "gemini://gemini.spam.works/textfiles/", 175 | "gemini://gemini.spam.works/mirrors/textfiles/", 176 | "gemini://gemini.spam.works/users/dvn/archive/", 177 | 178 | // streams that never end... 
179 | "gemini://gemini.thebackupbox.net/radio", 180 | "gemini://higeki.jp/radio", 181 | 182 | // full web proxy 183 | "gemini://drewdevault.com/cgi-bin/web.sh?", 184 | "gemini://gemiprox.pollux.casa/", 185 | "gemini://gemiprox.pollux.casa:1966", 186 | "gemini://ecs.d2evs.net/proxy/", 187 | "gemini://gmi.si3t.ch/www-gem/", 188 | "gemini://orrg.clttr.info/orrg.pl", 189 | "gemini://mysidard.com/services/hackernews/", 190 | 191 | // killing crawl, I think maybe because it's too big 192 | // cryptocurrency bullshit 193 | "gemini://gem.denarii.cloud/", 194 | 195 | // docs - not our business 196 | "gemini://cfdocs.wetterberg.nu/", 197 | "gemini://godocs.io", 198 | "gemini://emacswiki.org/", 199 | 200 | // ^^^^^^^ GUS blacklist ^^^^^^^ 201 | // vvvvvvv TLGS blacklist vvvvvvv 202 | 203 | // He doen't like bots. As your wish (Just put up a robots.txt) 204 | "gemini://alexschroeder.ch/", 205 | 206 | // Code, RFC, man page 207 | "gemini://si3t.ch/code/", 208 | "gemini://tilde.club/~filip/library/", 209 | "gemini://gemini.bortzmeyer.org/rfc-mirror/", 210 | "gemini://chris.vittal.dev/rfcs", 211 | "gemini://going-flying.com/git/cgi/gemini.git/", 212 | "gemini://szczezuja.flounder.online/git/", 213 | "gemini://gmi.noulin.net/rfc", 214 | "gemini://gmi.noulin.net/man", 215 | "gemini://hellomouse.net/user-pages/handicraftsman/ietf/", 216 | "gemini://tilde.team/~orichalcumcosmonaut/darcs/website/prod/", 217 | "gemini://gemini.omarpolo.com/cgi", 218 | "gemini://gemini.rmf-dev.com", 219 | 220 | 221 | // Archives 222 | "gemini://musicbrainz.uploadedlobster.com/", 223 | "gemini://musicbrainz.uploadedlobster.com/", 224 | "gemini://gemini.lost-frequencies.eu/posts/archive", 225 | "gemini://blitter.com/", 226 | "gemini://ake.crabdance.com:1966/message/", 227 | "gemini://iceworks.cc/z/", 228 | "gemini://ake.crabdance.com:1966/channel/", 229 | "gemini://gemini.autonomy.earth/posts/", 230 | "gemini://lists.flounder.online/gemini/threads/messages/", 231 | 
"gemini://tilde.pink/~bencollver/gamefaqs/", 232 | "gemini://tilde.pink/~bencollver/gamefaq/", 233 | "gemini://gemini.quux.org/0/Archives", 234 | 235 | // Songs? 236 | "gemini://gemini.rob-bolton.co.uk/songs", 237 | 238 | // Text based game 239 | "gemini://gthudson.xyz/cgi-bin/quietplace.cgi", 240 | "gemini://futagoza.gamiri.com/gmninkle/", 241 | "gemini://alexey.shpakovsky.ru/maze", 242 | "gemini://jsreed5.org/live/cgi-bin/twisty/", 243 | 244 | // dead capsule 245 | "gemini://gemini.theuse.net/", 246 | 247 | // Infine stream of data - we timeout but not useful to index 248 | "gemini://202x.moe/resonance", 249 | 250 | // Redirection + some infinate recursion somewhere 251 | "gemini://gmi.skyjake.fi/", 252 | "gemini://warmedal.se/.well-known/", 253 | 254 | // Non standard conforming robots.txt 255 | "gemini://www.bonequest.com/", 256 | 257 | // meta info from search engines 258 | "gemini://kennedy.gemi.dev/page-info?id=", 259 | "gemini://gemi.dev/xkcd", 260 | "gemini://gemi.dev/cgi-bin/", 261 | 262 | // Large book archive and causing OOM 263 | "gemini://gemlog.stargrave.org/", 264 | 265 | // Large pile of (to us) useless data 266 | "gemini://jsreed5.org/oeis/", 267 | }; 268 | 269 | static tlgs::UrlBlacklist blacklist; 270 | static std::once_flag blacklist_init; 271 | std::call_once(blacklist_init, [&]() { 272 | for (auto& url : blacklist_urls) 273 | blacklist.add(url); 274 | }); 275 | // If the domain or URL is blacklisted, we don't want to index it 276 | if(blacklist_domains.contains(url.host())) 277 | return true; 278 | if(blacklist.isBlocked(url.str())) 279 | return true; 280 | 281 | // hardcoeded: don't crawl from localhost or uneeded files 282 | if(url.path() == "/robots.txt" || url.path() == "/favicon.txt") 283 | return true; 284 | // The entire 127.0.0.1/24 subnet 285 | if(url.host().starts_with("127.0.0.")) 286 | return true; 287 | // ending with .local or .localhost 288 | if(url.host().ends_with(".local") || url.host().ends_with(".localhost") || 
url.host().ends_with(".localdomain")) 289 | return true; 290 | 291 | // Ignore all potential git repos 292 | if(url.path().starts_with("/git/")) 293 | return true; 294 | if(url.host().starts_with("git.")) 295 | return true; 296 | if(url.str().find(".git/tree/") != std::string::npos) 297 | return true; 298 | if(url.str().find(".git/blob/") != std::string::npos) 299 | return true; 300 | if(url.str().ends_with("/git.sh")) 301 | return true; 302 | // XKCD archives 303 | if(url.str().find("/~xkcd/") != std::string::npos) 304 | return true; 305 | if(url.str().find("/xkcd/") != std::string::npos) 306 | return true; 307 | // LEO (Low Earth Orbit) webring. These affect how well ranking works 308 | if(url.path().ends_with("/next.cgi") || url.path().ends_with("/prev.cgi") || url.path().ends_with("/rand.cgi")) 309 | return true; 310 | // Other orbits 311 | if(url.path().ends_with("/next") || url.path().ends_with("/prev") || url.path().ends_with("/rand") 312 | || url.path().ends_with("/next.gmi") || url.path().ends_with("/prev.gmi") || url.path().ends_with("/rand.gmi")) 313 | return true; 314 | 315 | // We don't have the ablity crawl hidden sites, yet 316 | if(url.host().ends_with(".onion")) 317 | return true; 318 | 319 | // seems to be a sign of common gopher proxy 320 | if(url.str().find("gopher:/:/") != std::string::npos) 321 | return true; 322 | if(url.str().find("rfc-mirror") != std::string::npos) 323 | return true; 324 | // links should not contain ASCII control characters 325 | if(auto url_str = url.str(); 326 | std::find_if(url_str.begin(), url_str.end(), [](char c) { return c >= 0 && c < 32; }) != url_str.end()) 327 | return true; 328 | 329 | // Avoid wrongly redirected URLs like gemini://www.example.com/cgi/cgi/cgi/cgi... 
330 | // We allow 2 same path components as /image/gemlog/2020/images sounds like a legit path 331 | auto path = std::filesystem::path(url.path()); 332 | if(std::distance(path.begin(), path.end()) >= 3) { 333 | std::unordered_map dirname_count; 334 | for(const auto& part : path) { 335 | if(++dirname_count[part.generic_string()] >= 3) 336 | return true; 337 | } 338 | } 339 | 340 | //XXX: half working way to detect commits 341 | auto n = url.str().find("commits/"); 342 | if(n != std::string::npos) { 343 | static const std::regex re("commits/[a-z0-9A-Z]+[/\\.].*"); 344 | std::smatch sm; 345 | std::string commit = url.str().substr(n); 346 | if(std::regex_match(commit, sm, re)) 347 | return true; 348 | } 349 | 350 | return false; 351 | } 352 | -------------------------------------------------------------------------------- /tlgs/crawler/crawler.cpp: -------------------------------------------------------------------------------- 1 | #include "crawler.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | #include "iconv.hpp" 26 | #include "blacklist.hpp" 27 | 28 | #include 29 | 30 | using namespace drogon; 31 | using namespace dremini; 32 | using namespace trantor; 33 | 34 | static std::string pgSQLRealEscape(std::string str) 35 | { 36 | drogon::utils::replaceAll(str, "\\", "\\\\"); 37 | drogon::utils::replaceAll(str, std::string(1, '\0'), "\\0"); 38 | drogon::utils::replaceAll(str, "\n", "\\n"); 39 | drogon::utils::replaceAll(str, "\r", "\\r"); 40 | drogon::utils::replaceAll(str, "'", "''"); 41 | drogon::utils::replaceAll(str, "\"", "\\\""); 42 | drogon::utils::replaceAll(str, "\x1a", "\\Z"); 43 | return str; 44 | } 45 | 46 | static std::string tryConvertEncoding(const std::string_view& str, const std::string& src_enc, const std::string& dst_enc, bool 
ignore_err = true) 47 | { 48 | // still perform conversion event if source encoding is the same as destination encoding 49 | // because the input string might have bad encoding 50 | std::string res; 51 | try { 52 | iconvpp::converter converter(src_enc, dst_enc, true); 53 | converter.convert(str, res); 54 | } 55 | catch(...) { 56 | res = str; 57 | } 58 | return res; 59 | } 60 | 61 | static std::pair> parseMime(const std::string& mime) 62 | { 63 | std::string mime_str; 64 | std::unordered_map params; 65 | if(mime.empty()) 66 | return {mime_str, params}; 67 | 68 | size_t idx = 0; 69 | for(;idx> GeminiCrawler::getNextPotentialCarwlUrl() 91 | { 92 | std::string result; 93 | if(craw_queue_.try_pop(result)) 94 | co_return result; 95 | 96 | // co_return {}; 97 | 98 | // Even if multiple threads are trying to aquire data from the same table, they push into the same queue. Thus 99 | // we can safely stop querying if we got data in the queue. (as long as LIMIT >> max_concurrent_connections_) 100 | static std::atomic sample_pct{1}; 101 | constexpr int urls_per_batch = 360; 102 | while(craw_queue_.empty()) { 103 | auto db = app().getDbClient(); 104 | try { 105 | int tablesample_pct = sample_pct.load(std::memory_order_acquire); 106 | // HACK: Seems we can't pass bind variables to a subquery, Just compose the query string 107 | std::string sample_str; 108 | if(tablesample_pct <= 80) 109 | sample_str = fmt::format("TABLESAMPLE SYSTEM({})", tablesample_pct); 110 | auto urls = co_await db->execSqlCoro(fmt::format("UPDATE pages SET last_queued_at = CURRENT_TIMESTAMP " 111 | "WHERE url in (SELECT url FROM pages {} WHERE (last_crawled_at < CURRENT_TIMESTAMP - INTERVAL '3' DAY " 112 | "OR last_crawled_at IS NULL) AND (last_queued_at < CURRENT_TIMESTAMP - INTERVAL '5' MINUTE OR last_queued_at IS NULL) " 113 | "LIMIT {} FOR UPDATE) RETURNING url" 114 | ,sample_str , urls_per_batch)); 115 | if(urls.size() < urls_per_batch*0.9) { 116 | const int new_value = std::min(tablesample_pct + 10, 
// Decides whether `url_str` may be crawled at all: it must be a valid
// gemini:// URL, not blacklisted, its host must not have repeatedly timed
// out, and its path must not be disallowed by the capsule's robots.txt.
// robots.txt policies are cached in the DB (2 days) and in a local CacheMap
// (60 s). Network failure while fetching robots.txt fails OPEN (returns true).
Task<bool> GeminiCrawler::shouldCrawl(std::string url_str)
{
    if(url_str.empty())
        co_return false;
    const auto url = tlgs::Url(url_str);
    if(url.good() == false) {
        LOG_ERROR << "Failed to parse URL " << url_str;
        co_return false;
    }
    if(url.protocol() != "gemini") {
        LOG_ERROR << url_str << " is not a Gemini URL";
        co_return false;
    }
    if(inBlacklist(url.str()))
        co_return false;
    // Do not crawl hosts known to be down (more than 3 recorded timeouts)
    // TODO: Put this on SQL
    auto timeout = host_timeout_count_.find(url.hostWithPort(1965));
    if(timeout != host_timeout_count_.end() && timeout->second > 3)
        co_return false;

    // TODO: Use a LRU cache
    // Consult the database to see if this URL is in robots.txt. Contents from the DB is cached locally to
    // reduce the number of DB queries
    const std::string cache_key = url.hostWithPort(1965);
    static drogon::CacheMap<std::string, std::vector<std::string>> policy_cache(loop_, 5);
    std::vector<std::string> disallowed_path;
    if(policy_cache.findAndFetch(cache_key, disallowed_path))
        co_return !tlgs::isPathBlocked(url.path(), disallowed_path);

    LOG_TRACE << "Cannot find " << cache_key << " in local policy cache";
    auto db = app().getDbClient();
    auto policy_status = co_await db->execSqlCoro("SELECT have_policy FROM robot_policies_status "
        "WHERE host = $1 AND port = $2 AND last_crawled_at > CURRENT_TIMESTAMP - INTERVAL '2' DAY", url.host(), url.port());
    if(policy_status.size() == 0) {
        LOG_TRACE << url.hostWithPort(1965) << " has no up to date robots policy stored in DB. Asking the host for robots.txt";
        HttpResponsePtr resp;
        // FIXME: The crawler may request robots.txt multiple times if multiple URLs on the same host are requested.
        // This is not as efficient as it could be. But does not cause any problems otherwise.
        try {
            std::string robot_url = tlgs::Url(url).withParam("").withPath("/robots.txt").withFragment("").str();
            LOG_TRACE << "Fetching robots.txt from " << robot_url;
            resp = co_await dremini::sendRequestCoro(robot_url, 10, loop_, 0x2625a0, {}, 10);
        }
        catch(std::exception& e) {
            // XXX: Failed to handshake with the host. We should retry later
            // Should we cache the result as no policy is available?
            // policy_cache[cache_key] = {};
            std::string error = e.what();
            if(error == "Timeout" || error == "NetworkFailure")
                host_timeout_count_[url.hostWithPort(1965)]++;
            // Fail open: a host we cannot reach is not treated as disallowed here.
            co_return true;
        }

        assert(resp != nullptr);
        auto [mime, _] = parseMime(resp->contentTypeString());
        int status = std::stoi(resp->getHeader("gemini-status"));
        // HACK: Some capsules have broken MIME
        bool have_robots_txt = status == 20 && (mime == "text/plain" || mime == "text/gemini");
        if(have_robots_txt) {
            // Honour the wildcard agent plus the agents this crawler identifies as.
            disallowed_path = tlgs::parseRobotsTxt(std::string(resp->body()), {"*", "tlgs", "indexer"});
        }

        // Persist the freshly fetched policy so other workers can reuse it.
        try {
            auto t = co_await db->newTransactionCoro();
            co_await t->execSqlCoro("DELETE FROM robot_policies WHERE host = $1 AND port = $2;", url.host(), url.port());
            for(const auto& disallow : disallowed_path) {
                co_await t->execSqlCoro("INSERT INTO robot_policies (host, port, disallowed) VALUES ($1, $2, $3)",
                    url.host(), url.port(), disallow);
            }
            co_await t->execSqlCoro("INSERT INTO robot_policies_status(host, port, last_crawled_at, have_policy) VALUES ($1, $2, CURRENT_TIMESTAMP, $3) "
                "ON CONFLICT (host, port) DO UPDATE SET last_crawled_at = CURRENT_TIMESTAMP, have_policy = $3;"
                , url.host(), url.port(), have_robots_txt);
        }
        catch(...) {
            // Screw it. Someone else updated the policies. They've done the same job. We can keep on working
        }
    }
    else if(policy_status[0]["have_policy"].as<bool>()) {
        LOG_TRACE << url.hostWithPort(1965) << " has robots policy stored in DB.";
        auto stored_policy = co_await db->execSqlCoro("SELECT disallowed FROM robot_policies WHERE host = $1 AND port = $2;"
            , url.host(), url.port());
        for(const auto& path : stored_policy)
            disallowed_path.push_back(path["disallowed"].as<std::string>());
    }
    // else: host is known to have no policy -> disallowed_path stays empty.

    bool should_crawl = !tlgs::isPathBlocked(url.path(), disallowed_path);
    policy_cache.insert(cache_key, std::move(disallowed_path), 60);
    co_return should_crawl;
}
We can keep on working 224 | } 225 | } 226 | else if(policy_status[0]["have_policy"].as()) { 227 | LOG_TRACE << url.hostWithPort(1965) << " has robots policy stored in DB."; 228 | auto stored_policy = co_await db->execSqlCoro("SELECT disallowed FROM robot_policies WHERE host = $1 AND port = $2;" 229 | , url.host(), url.port()); 230 | for(const auto& path : stored_policy) 231 | disallowed_path.push_back(path["disallowed"].as()); 232 | } 233 | 234 | bool should_crawl = !tlgs::isPathBlocked(url.path(), disallowed_path); 235 | policy_cache.insert(cache_key, std::move(disallowed_path), 60); 236 | co_return should_crawl; 237 | } 238 | 239 | Task> GeminiCrawler::getNextCrawlPage() 240 | { 241 | auto db = app().getDbClient(); 242 | while(1) { 243 | auto next_url = co_await getNextPotentialCarwlUrl(); 244 | if(next_url.has_value() == false) 245 | co_return {}; 246 | 247 | auto url_str = next_url.value(); 248 | 249 | // URL should not contain any ASCII control characters 250 | auto it = std::find_if(url_str.begin(), url_str.end(), [](char c) { return c < 0x20; }); 251 | auto can_crawl = it == url_str.end() && co_await shouldCrawl(url_str); 252 | if(can_crawl == false) { 253 | co_await db->execSqlCoro("UPDATE pages SET last_crawled_at = CURRENT_TIMESTAMP, last_status = $2, last_meta = $3 WHERE url = $1;" 254 | , url_str, 0, std::string("blocked")); 255 | co_await db->execSqlCoro("DELETE FROM pages WHERE url = $1 AND last_crawl_success_at < CURRENT_TIMESTAMP - INTERVAL '30' DAY;" 256 | , url_str); 257 | continue; 258 | } 259 | 260 | // shouldCrawl() validates the URL. So we can safely use tlgs::Url here 261 | co_return tlgs::Url(url_str).str(); 262 | } 263 | 264 | LOG_FATAL << "Should not reach here in Crawler::getNextCrawlPage()"; 265 | co_return {}; 266 | } 267 | 268 | void GeminiCrawler::dispatchCrawl() 269 | { 270 | if(ended_) 271 | return; 272 | // In case I forgot how this works in the future: 273 | // Start new crawls up to max_concurrent_connections_. 
// Keeps the crawl saturated: spawns crawl tasks until max_concurrent_connections_
// are in flight, and re-dispatches whenever a crawl finishes.
void GeminiCrawler::dispatchCrawl()
{
    if(ended_)
        return;
    // In case I forgot how this works in the future:
    // Start new crawls up to max_concurrent_connections_. And launch new crawls when we might have more pages to crawl.
    // This function is tricky and difficult to understand. It's a bit of a hack. But it is 100% lock free. The general idea is as
    // follows:
    // 1. An atomic counter is used to keep track of the number of active crawls.
    // 2. If we successfully got a new url to crawl, we dispatch a new crawl.
    // 3. When a crawl finishes, we resubmit the crawl task.
    //    * Keep re-submitting crawls until there's no more to crawl
    //    * It doesn't matter if currently enough crawlers exist. Since the counter stops them from crawling
    // 4. Near the end of crawling. It might be possible that there's not enough pages to be crawled
    //    * But a crawler may suddenly submit more links for crawling.
    //    * The nature of dispatching more and more crawlers helps. Reactivating crawls if needed.
    //
    // NOTE(review): tlgs::Counter assumed (RAII handle over ongoing_crawlings_: increments
    // on construction, decrements on destruction/release) — template argument was lost in
    // extraction, TODO confirm against tlgsutils/counter.hpp.
    auto counter = std::make_shared<tlgs::Counter>(ongoing_crawlings_);
    if(counter->count() >= max_concurrent_connections_)
        return;

    async_run([counter, this]() mutable -> Task<void> {try{
        auto url_str = co_await getNextCrawlPage();
        // Crawling has ended if the following is true
        // 1. There's no more URL to crawl
        // 2. The current crawl is the last one in existence
        //    * Since a crawler can add new items into the queue
        if(url_str.has_value() == false) {
            if(counter->release() == 1)
                ended_ = true;
            co_return;
        }
        // Start a sibling crawl right away so the connection budget stays full
        // while this task is busy fetching.
        loop_->runInLoop([this](){dispatchCrawl();});

        try {
            bool success = co_await crawlPage(url_str.value());
            if(success)
                LOG_INFO << "Processed " << url_str.value();
            // else // we already print out the error message in crawlPage()
            //     LOG_ERROR << "Failed to process " << url_str.value();
        }
        catch(std::exception& e) {
            // crawlPage() handles its own failures; anything escaping it is a bug.
            LOG_ERROR << "Exception escaped crawling "<< url_str.value() <<": " << e.what();
            abort();
        }
        // This crawl is done; hand the slot back to the dispatcher.
        loop_->queueInLoop([this](){dispatchCrawl();});
    }
    catch(std::exception& e) {
        LOG_ERROR << "Exception escaped in dispatchCrawl(): " << e.what();
        dispatchCrawl();
    }});
}
The current crawl is the last one in existance 293 | // * Since a crawler can add new items into the queue 294 | if(url_str.has_value() == false) { 295 | if(counter->release() == 1) 296 | ended_ = true; 297 | co_return; 298 | } 299 | loop_->runInLoop([this](){dispatchCrawl();}); 300 | 301 | try { 302 | bool success = co_await crawlPage(url_str.value()); 303 | if(success) 304 | LOG_INFO << "Processed " << url_str.value(); 305 | // else // we already print out the error message in crawlPage() 306 | // LOG_ERROR << "Failed to process " << url_str.value(); 307 | } 308 | catch(std::exception& e) { 309 | LOG_ERROR << "Exception escaped crawling "<< url_str.value() <<": " << e.what(); 310 | abort(); 311 | } 312 | loop_->queueInLoop([this](){dispatchCrawl();}); 313 | } 314 | catch(std::exception& e) { 315 | LOG_ERROR << "Exception escaped in dispatchCrawl(): " << e.what(); 316 | dispatchCrawl(); 317 | }}); 318 | } 319 | 320 | Task GeminiCrawler::crawlPage(const std::string& url_str) 321 | { 322 | auto db = app().getDbClient(); 323 | const auto url = tlgs::Url(url_str); 324 | if(url.good() == false || url.str() != url_str) { 325 | // It's fine we delete unnormalized URLs since the crawler will just add them back later when encounter it again 326 | LOG_WARN << "Warning: URL " << url_str << " is not normalized or invalid. Removing it from the queue."; 327 | co_await db->execSqlCoro("DELETE FROM pages WHERE url = $1", url_str); 328 | co_await db->execSqlCoro("DELETE FROM links WHERE url = $1 OR to_url = $1", url_str); 329 | co_return false; 330 | } 331 | 332 | std::string error; 333 | try { 334 | if(co_await shouldCrawl(url.str()) == false) 335 | throw std::runtime_error("Blocked by robots.txt"); 336 | auto record = co_await db->execSqlCoro("SELECT url, indexed_content_hash , raw_content_hash, last_status" 337 | ", last_crawled_at FROM pages WHERE url = $1;", url.str()); 338 | bool have_record = record.size() != 0; 339 | auto indexed_content_hash = have_record ? 
record[0]["indexed_content_hash"].as() : ""; 340 | auto raw_content_hash = have_record ? record[0]["raw_content_hash"].as() : ""; 341 | 342 | if(!have_record) { 343 | co_await db->execSqlCoro("INSERT INTO pages(url, domain_name, port, first_seen_at)" 344 | " VALUES ($1, $2, $3, CURRENT_TIMESTAMP);", 345 | url.str(), url.host(), url.port()); 346 | } 347 | else { 348 | // 53 proxy error. Likely misconfigured proxy/domain or bad links pointing to the wrong domain that is on the 349 | // smae IP. Only retry once every 21 days 350 | auto last_status = record[0]["last_status"].isNull() ? 0 : record[0]["last_status"].as(); 351 | auto last_crawled_at = [&](){ 352 | auto var = record[0]["last_crawled_at"]; 353 | if(var.isNull()) 354 | return trantor::Date(); 355 | else 356 | return trantor::Date::fromDbStringLocal(var.as()); 357 | }(); 358 | 359 | if(last_status == 53 && last_crawled_at.after(21*7*24*3600) < trantor::Date::now()) { 360 | co_await db->execSqlCoro("UPDATE pages SET last_crawled_at = CURRENT_TIMESTAMP, last_status = 0 WHERE url = $1;", url.str()); 361 | LOG_INFO << "Skipping " << url.str() << " that was proxy-errored recently"; 362 | co_return true; 363 | } 364 | } 365 | 366 | // Only fetch the entire page for content types we can handle. 367 | static const std::vector indexd_mimes = {"text/gemini", "text/plain", "text/markdown", "text/x-rst", "plaintext"}; 368 | HttpResponsePtr resp; 369 | int redirection_count = 0; 370 | int status; 371 | tlgs::Url crawl_url = url; 372 | do { 373 | auto redirect = co_await db->execSqlCoro("SELECT to_url FROM perma_redirects WHERE from_url = $1;", crawl_url.str()); 374 | if(redirect.size() != 0) { 375 | crawl_url = tlgs::Url(redirect[0]["to_url"].as()); 376 | redirection_count++; 377 | status = 30; 378 | continue; 379 | } 380 | // 2.5MB is the maximum size of page we will index. 10s timeout, max 5 redirects and 25s max transfer time. 
381 | resp = co_await dremini::sendRequestCoro(crawl_url.str(), 10, loop_, 0x2625a0, indexd_mimes, 25.0); 382 | 383 | status = std::stoi(resp->getHeader("gemini-status")); 384 | if(status / 10 == 3) { 385 | auto redirect_url = tlgs::Url(resp->getHeader("meta")); 386 | if(redirect_url.good() == false || crawl_url.str() == redirect_url.str()) 387 | throw std::runtime_error("Bad redirect"); 388 | if(url.protocol() != "gemini") 389 | throw std::runtime_error("Redirected to non-gemini URL"); 390 | if(co_await shouldCrawl(redirect_url.str()) == false) 391 | throw std::runtime_error("Redirected to blocked URL"); 392 | 393 | if(status == 31) { 394 | co_await db->execSqlCoro("INSERT INTO perma_redirects (from_url, to_url) VALUES ($1, $2) ON CONFLICT (from_url) DO UPDATE SET to_url = $2;", 395 | crawl_url.str(), redirect_url.str()); 396 | } 397 | crawl_url = std::move(redirect_url); 398 | } 399 | } while(status / 10 == 3 && redirection_count++ < 5); 400 | 401 | if(resp == nullptr) 402 | throw std::runtime_error("No concrete response. Too many redirects?"); 403 | 404 | const auto& meta = resp->getHeader("meta"); 405 | std::string mime; 406 | std::optional charset; 407 | std::optional lang; 408 | std::string title; 409 | std::string body; 410 | std::vector links; 411 | size_t body_size = resp->body().size(); 412 | std::optional feed_type; 413 | auto new_raw_content_hash = tlgs::xxHash64(resp->body()); 414 | if(status/10 == 2) { 415 | auto [mime_str, mime_param] = parseMime(meta); 416 | mime = std::move(mime_str); 417 | // trim leading and tailing space and tab from mime as some servers send it 418 | mime.erase(0, mime.find_first_not_of(" \t")); 419 | mime.erase(mime.find_last_not_of(" \t") + 1); 420 | charset = mime_param.count("charset") ? mime_param["charset"] : std::optional{}; 421 | lang = mime_param.count("lang") ? mime_param["lang"] : std::optional{}; 422 | 423 | // No reason to reindex if the content hasn't changed. 
`force_reindex_` is used to force reindexing of files 424 | if(force_reindex_ == false && raw_content_hash == new_raw_content_hash) { 425 | co_await db->execSqlCoro("UPDATE pages SET last_crawled_at = CURRENT_TIMESTAMP, last_crawl_success_at = CURRENT_TIMESTAMP, " 426 | "last_status = $2, last_meta = $3, content_type = $4 WHERE url = $1;", 427 | url.str(), status, meta, mime); 428 | co_return true; 429 | } 430 | 431 | // We should only have text files at this point. Try convert everything to UTF-8 because iconv will 432 | // ignore all encoding errors. Thus make Postgres happy for files with doggy encodings. 433 | std::string body_raw = tryConvertEncoding(resp->body(), charset.value_or("utf-8"), "utf-8"); 434 | // The worst case is 25% from UTF-32 to UTF-8. Smaller than 20% is definatelly a binary file. We don't want to index it. 435 | if(body_raw.size() < resp->body().size()/5) 436 | throw std::runtime_error("Possible binary files sent as text"); 437 | 438 | if(mime == "text/gemini") { 439 | auto nodes = dremini::parseGemini(body_raw); 440 | tlgs::GeminiDocument doc = tlgs::extractGeminiConcise(nodes); 441 | body = std::move(doc.text); 442 | links = std::move(doc.links); 443 | title = std::move(doc.title); 444 | if(tlgs::isGemsub(nodes, url, "gemini")) 445 | feed_type = "gemsub"; 446 | 447 | // remove empty links 448 | links.erase(std::remove_if(links.begin(), links.end(), [](const std::string& link) { 449 | return link.empty(); 450 | }), links.end()); 451 | if(title.empty()) 452 | title = url.str(); 453 | } 454 | else if(mime == "text/plain" || mime == "plaintext" || mime == "text/markdown" || mime == "text/x-rst") { 455 | if(url.path().ends_with("/twtxt.txt")) 456 | feed_type = "twtxt"; 457 | title = url.str(); 458 | body = std::move(body_raw); 459 | } 460 | else { 461 | if(mime == "application/rss+xml") 462 | feed_type = "rss"; 463 | else if(mime == "application/atom+xml") 464 | feed_type = "atom"; 465 | title = url.str(); 466 | body = ""; 467 | body_size = 0; 
468 | } 469 | } 470 | else if(status/10 == 1) { 471 | body = meta; 472 | title = meta; 473 | body_size = meta.size(); 474 | mime = ""; 475 | } 476 | else { 477 | LOG_ERROR << "Failed to fetch " << url.str() << ": " << status; 478 | co_await db->execSqlCoro("UPDATE pages SET last_crawled_at = CURRENT_TIMESTAMP, last_status = $2, last_meta = $3 WHERE url = $1;" 479 | , url.str(), status, meta); 480 | co_await db->execSqlCoro("DELETE FROM pages WHERE url = $1 AND last_crawl_success_at < CURRENT_TIMESTAMP - INTERVAL '30' DAY;" 481 | , url.str()); 482 | co_return false; 483 | } 484 | // safeguard in case title is too long for Postgres 485 | if(title.size() > 1000) 486 | title = title.substr(0, 1000) + "..."; 487 | 488 | auto new_indexed_content_hash = tlgs::xxHash64(body); 489 | // Absolutelly no reason to reindex if the content hasn't changed even after post processing. 490 | if(new_indexed_content_hash == indexed_content_hash && new_raw_content_hash == raw_content_hash) { 491 | // Maybe this is too strict? The conent doesn't change means the content_type doesn't change, right...? 492 | co_await db->execSqlCoro("UPDATE pages SET last_crawled_at = CURRENT_TIMESTAMP, last_crawl_success_at = CURRENT_TIMESTAMP, " 493 | "last_status = $2, last_meta = $3, content_type = $4 WHERE url = $1;", 494 | url.str(), status, meta, mime); 495 | co_return true; 496 | } 497 | 498 | std::set link_urls; 499 | for(const auto& link : links) { 500 | // ignore links like mailto: ldap:. etc.. 501 | if(tlgs::isNonUriAction(link)) 502 | continue; 503 | 504 | auto link_url = tlgs::Url(link); 505 | if(link_url.good()) { 506 | if(link_url.protocol() == "") 507 | link_url.withProtocol(url.protocol()); 508 | if(link_url.protocol() != "gemini") 509 | continue; 510 | } 511 | // sometimes invalid host/port causes the URL to be invalid. 
Ignore them 512 | else if(link.starts_with("gemini://")) { 513 | continue; 514 | } 515 | // Drop links that are too long and obviously invalid 516 | else if(link.size() > 1024) { 517 | continue; 518 | } 519 | else { 520 | link_url = linkCompose(url, link); 521 | if(link_url.good() == false) 522 | continue; 523 | } 524 | // We shall not send fragments 525 | link_url.withFragment(""); 526 | 527 | // HACK: avoid mistyped links like gemini://en.gmn.clttr.info/cgmnlm.gmi?gemini://en.gmn.clttr.info/cgmnlm.gmi 528 | if(link_url.str().starts_with(link_url.param()) && link_url.path().ends_with(".gmi")) 529 | link_url.withParam(""); 530 | link_urls.insert(std::move(link_url)); 531 | } 532 | 533 | // TODO: Use C++20 ranges. My basic implementation is not as efficent as it could be. 534 | auto cross_site_links = tlgs::map(tlgs::filter(link_urls, [&url](const tlgs::Url& link_url) { 535 | return link_url.host() != url.host() || url.port() != link_url.port(); 536 | }) 537 | , [](const tlgs::Url& link_url) { 538 | return link_url.str(); 539 | }); 540 | auto internal_links = tlgs::map(tlgs::filter(link_urls, [&url](const tlgs::Url& link_url) { 541 | return !(link_url.host() != url.host() || url.port() != link_url.port()); 542 | }) 543 | , [](const tlgs::Url& link_url) { 544 | return link_url.str(); 545 | }); 546 | 547 | // TODO: Guess the language of the content. 
Then index them with different parsers 548 | co_await db->execSqlCoro("UPDATE pages SET content_body = $2, size = $3, charset = $4, lang = $5, last_crawled_at = CURRENT_TIMESTAMP, " 549 | "last_crawl_success_at = CURRENT_TIMESTAMP, last_status = $6, last_meta = $7, content_type = $8, title = $9, " 550 | "cross_site_links = $10::json, internal_links = $11::json, indexed_content_hash = $12, raw_content_hash = $13, feed_type = $14 WHERE url = $1;", 551 | url.str(), body, body_size, charset, lang, status, meta, mime, title, nlohmann::json(cross_site_links).dump() 552 | , nlohmann::json(internal_links).dump(), new_indexed_content_hash, new_raw_content_hash, feed_type); 553 | 554 | // Full text index update 555 | auto index_firendly_url = indexFriendly(url); 556 | co_await db->execSqlCoro("UPDATE pages SET search_vector = to_tsvector(REPLACE(title, '.', ' ') || ' ' || $2 || ' ' || content_body), " 557 | "title_vector = to_tsvector(REPLACE(title, '.', ' ') || ' ' || $2), last_indexed_at = CURRENT_TIMESTAMP WHERE url = $1;" 558 | , url.str(), index_firendly_url); 559 | if(internal_links.size() == 0 && cross_site_links.size() == 0) 560 | co_return true; 561 | 562 | // Update link formation 563 | // XXX: Drogon does not support bulk insert API. 
We have to do with string concatenation (with proper escaping) 564 | std::string link_query = "INSERT INTO links (url, host, port, to_url, is_cross_site, to_host, to_port) VALUES "; 565 | std::string page_query = "INSERT INTO pages (url, domain_name, port, first_seen_at) VALUES "; 566 | size_t page_count = 0; 567 | for(const auto& link_url : link_urls) { 568 | bool is_cross_site = link_url.host() != url.host() || url.port() != link_url.port(); 569 | 570 | link_query += fmt::format("('{}', '{}', {}, '{}', {}, '{}', {}), ", 571 | pgSQLRealEscape(url.str()), pgSQLRealEscape(url.host()), url.port(), pgSQLRealEscape(link_url.str()), 572 | is_cross_site, pgSQLRealEscape(link_url.host()), link_url.port()); 573 | 574 | if(co_await shouldCrawl(link_url.str()) == false) 575 | continue; 576 | page_query += fmt::format("('{}', '{}', {}, CURRENT_TIMESTAMP), ", 577 | pgSQLRealEscape(link_url.str()), pgSQLRealEscape(link_url.host()), link_url.port()); 578 | page_count++; 579 | } 580 | 581 | co_await db->execSqlCoro("DELETE FROM links WHERE url = $1", url.str()); 582 | co_await db->execSqlCoro(link_query.substr(0, link_query.size() - 2) + " ON CONFLICT DO NOTHING;"); 583 | if(page_count != 0) 584 | co_await db->execSqlCoro(page_query.substr(0, page_query.size() - 2) + " ON CONFLICT DO NOTHING;"); 585 | } 586 | catch(std::exception& e) { 587 | error = e.what(); 588 | } 589 | 590 | if(error == "Timeout" || error == "NetworkFailure") 591 | host_timeout_count_[url.hostWithPort(1965)]++; 592 | if(error != "") { 593 | co_await db->execSqlCoro("UPDATE pages SET last_crawled_at = CURRENT_TIMESTAMP, last_status = $2, last_meta = $3 WHERE url = $1;" 594 | , url.str(), 0, error); 595 | co_await db->execSqlCoro("DELETE FROM pages WHERE url = $1 AND last_crawl_success_at < CURRENT_TIMESTAMP - INTERVAL '30' DAY;" 596 | , url.str()); 597 | co_return false; 598 | } 599 | co_return true; 600 | } 601 | -------------------------------------------------------------------------------- 
/**
 * @brief A trie edge/node: a key infix (provided by PrefixHolderT) plus an
 *        open-addressed child table indexed by the first atom of each child.
 *
 * The child table size is always a power of two; a child is located by masking
 * its first key atom (atom_hash). A collision is resolved by growing the table
 * just enough that the two colliding atoms map to different slots
 * (least_uncolliding_size) — not by chaining.
 */
template <typename AtomT, typename PrefixHolderT> // NOTE(review): template head reconstructed from garbled extraction — TODO confirm
struct TrieNode : public PrefixHolderT
{
private:
    typedef TrieNode self_type;
    typedef self_type * self_pointer;

    self_pointer * data = nullptr;  // child slot array; nullptr entry = empty slot
    uint32_t size = 0; /* Will still be 64 bit due to alignment */

public:
    typedef self_type * const * map_iterator;

    TrieNode(int hint) { if (hint > 0) { resize(hint); } }

    // Slot index of atom x in a table of size mask+1 (mask must be size-1).
    inline static int atom_hash(AtomT x, uint32_t mask) {
        return (x & mask);
    }

    // Smallest power-of-two table size in which atoms a and b land in
    // different slots: twice the lowest bit where they differ.
    inline static int least_uncolliding_size(AtomT a, AtomT b)
    {
        unsigned int v = ((int) a ^ (int) b);
        return (v & -v) << 1;
    }

    static self_type * value(map_iterator x) { return *x; };

    // Re-bucket all children into a table of new_size slots (0 frees the table).
    // Child nodes themselves are owned by trie_map's deque, not deleted here.
    void resize(uint32_t new_size)
    {
        self_pointer * ndata = (new_size == 0) ?
            nullptr : (new self_pointer[new_size]);

        std::fill(ndata, ndata + new_size, nullptr);

        if (new_size > size) {
            for (uint32_t i = 0; i < size; ++i) {
                if (data[i] != nullptr) {
                    ndata[atom_hash(*data[i]->kbegin(), new_size - 1)] = data[i];
                }
            }
        }

        delete[] data;
        data = ndata;
        size = new_size;
    }

    // Slot of the child whose key starts with atom x, or nullptr if absent.
    map_iterator find(AtomT x) const
    {
        if (size != 0)
        {
            map_iterator result = data + atom_hash(x, size-1);

            if (nullptr != *result and (*result)->starts_with(x)) {
                return result;
            }
        }

        return nullptr;
    }

    ~TrieNode() { resize(0); };

    // Insert a child edge; grows the table whenever its slot is already taken.
    void put(self_type * edge)
    {
        AtomT x = *edge->kbegin();

        if (size == 0) { resize(2); }

        int hash = atom_hash(x, size-1);

        if (data[hash] == nullptr) {
            data[hash] = edge;
            return;
        }

        resize(least_uncolliding_size(x, *data[hash]->kbegin()));

        data[atom_hash(x, size-1)] = edge;
    }

    map_iterator begin() const { return data; }
    map_iterator end() const { return data + size; }
    std::nullptr_t nf() const { return nullptr; }

    // Split this edge's key at breakIdx: this node keeps the head of the key,
    // `next` takes the tail along with the children and the value, and then
    // becomes this node's (only) child.
    void split(self_type * next, int breakIdx)
    {
        this->PrefixHolderT::psplit(next, breakIdx);
        std::swap(this->data, next->data);
        std::swap(this->size, next->size);
        this->swap_value(*next);
        put(next);
    }
};
/**
 * @brief Internal DFS cursor over a trie. m_ptrs is the stack of child-table
 *        slots leading from m_root down to the current node; an empty stack
 *        means the cursor is at m_root itself, and m_root == nullptr marks
 *        the end position.
 */
template <typename AtomT, typename NodeT> // NOTE(review): template head reconstructed from garbled extraction — TODO confirm
struct TrieIteratorInternal
{
    typedef std::vector< AtomT > key_type;
    typedef typename NodeT::map_iterator traverse_ptr;

    key_type base_prefix;               // key atoms consumed before m_root
    const NodeT * m_root;
    std::vector<traverse_ptr> m_ptrs;   // path of child slots below m_root

    TrieIteratorInternal(const NodeT * a_root) : m_root(a_root) { };

    // Node i levels above the current one (i = 0 -> current node);
    // nullptr when asked for something above the root.
    const NodeT * get(int i = 0) const
    {
        int j = (int) m_ptrs.size() + i;
        return j > 0 ? NodeT::value(m_ptrs[j - 1]) : (j == 0 ? m_root : nullptr);
    }

    typename NodeT::value_type & get_value() const
    {
        NodeT * top = const_cast<NodeT *>(m_ptrs.empty() ? m_root : NodeT::value(m_ptrs.back()));
        return top->get_value();
    }

    // Full key of the current node, concatenating every edge infix on the path.
    std::basic_string<AtomT> get_key_str()
    {
        std::basic_string<AtomT> result(base_prefix.begin(), base_prefix.end());

        const NodeT * i = m_root;

        std::copy(i->kbegin(), i->kend(), std::back_inserter(result));

        for (auto && traverse_ptr : m_ptrs)
        {
            i = NodeT::value(traverse_ptr);
            std::copy(i->kbegin(), i->kend(), std::back_inserter(result));
        }

        return result;
    }

    // Vector variant of get_key_str().
    // NOTE(review): looks broken and presumably never instantiated —
    // vector::insert expects an iterator position but result.back() yields a
    // value, and traverse_ptr->second does not exist for NodeT::map_iterator
    // (compare get_key_str(), which uses NodeT::value). TODO confirm callers.
    key_type get_key()
    {
        key_type result(base_prefix);

        const NodeT * i = m_root;
        result.insert(result.back(), i->kbegin(), i->kend());

        for (auto && traverse_ptr : m_ptrs)
        {
            i = traverse_ptr->second;
            result.insert(result.back(), i->kbegin(), i->kend());
        }

        return result;
    }

    // Descend to the first non-empty child slot; false if the node is a leaf.
    bool step_down()
    {
        const NodeT * x = get();
        traverse_ptr it = x->begin();

        while (it != x->end()
               and NodeT::value(it) == nullptr) {
            ++it;
        }

        if (it != x->end())
        {
            m_ptrs.push_back(it);
            return true;
        }

        return false;
    }

    // Advance to the next sibling slot; false when the parent is exhausted
    // (or we are at the root and have no siblings).
    bool step_fore()
    {
        const NodeT * up = get(-1);

        if (up != nullptr)
        {
            do {
                ++m_ptrs.back();
            } while (m_ptrs.back() != up->end()
                     and NodeT::value(m_ptrs.back()) == nullptr);

            return m_ptrs.back() != up->end();
        }

        return false;
    }

    // Pop one level and try the next sibling there.
    bool step_up()
    {
        m_ptrs.pop_back();

        return step_fore();
    }

    // Pre-order advance: child, else sibling, else climb; m_root = nullptr at end.
    void next()
    {
        if (step_down()) { return; }
        if (step_fore()) { return; }

        while (!m_ptrs.empty()) {
            if (step_up()) { return; }
        }

        m_root = nullptr;
    }

    // Advance until a node that actually holds a value; false at end.
    bool next_value()
    {
        do {
            next();
        } while (m_root != nullptr and !get()->has_value());

        return m_root != nullptr;
    }

    void push(traverse_ptr it)
    {
        m_ptrs.push_back(it);
    }
};
typedef uint32_t trie_offset_t;

/**
 * @brief Optional node payload stored on the heap; absence = null pointer.
 */
template <typename ValueT> // NOTE(review): template head reconstructed from garbled extraction — TODO confirm
struct ValueHolder
{
private:
    std::unique_ptr<ValueT> value;
public:
    typedef ValueT value_type; /* Effective type */

    value_type & get_value() { return *value; };
    const value_type & get_value() const { return *value; };

    bool has_value() const noexcept { return value.get() != nullptr; };
    void set_value(const ValueT & x) { value.reset(new ValueT(x)); };
    // NOTE(review): forwards nullptr to set_value(const ValueT&), which only
    // compiles for ValueT constructible from nullptr — value.reset() was
    // presumably intended. TODO confirm before relying on clr_value().
    void clr_value() { set_value(nullptr); };

    void swap_value(ValueHolder & other) { std::swap(this->value, other.value); };
};

/**
 * @brief Specialization for set-like tries (see tlgs::SetCounter tag):
 *        the payload is just an occurrence count, zero meaning "no value".
 */
template <>
struct ValueHolder<SetCounter>
{
private:
    int count = 0;
public:
    typedef int value_type; /* Effective type */

    value_type & get_value() noexcept { return count; };
    const value_type & get_value() const noexcept { return count; };

    bool has_value() const noexcept { return count != 0; };
    void set_value(const value_type & x) { count = x; };
    void clr_value() { count = 0; };

    void swap_value(ValueHolder & other) { std::swap(count, other.count); };
};


/**
 * @brief Key-infix storage as (chunk, begin, end) offsets into a shared chunk
 *        vector. Offsets survive chunk reallocation; the raw-pointer variant
 *        below does not (see trie_map::insert_infix's reserve discipline).
 */
template <typename AtomT, typename ValueT, bool CInplaceKey> // NOTE(review): template head reconstructed — TODO confirm parameter list
struct PrefixHolder : public ValueHolder<ValueT>
{
private:
    typedef PrefixHolder self_type;
    typedef std::vector<AtomT> ChunkT;

    ChunkT * chunk;
    trie_offset_t begin, end;
public:
    typedef const AtomT * key_iterator;

    bool starts_with(AtomT x) const { return (*chunk)[begin] == x; };
    key_iterator kbegin() const { return std::addressof((*chunk)[begin]); };
    key_iterator kend() const { return std::addressof((*chunk)[end]); };

    // New keys should be appended to this node's own chunk when possible.
    ChunkT * insertion_hint() { return chunk; }

    void setkey(ChunkT * achunk, trie_offset_t k, trie_offset_t kend)
    {
        chunk = achunk;
        begin = k;
        end = kend;
    }

    // Move the tail [breakIdx, end) of this infix into `next`.
    void psplit(self_type * next, int breakIdx)
    {
        next->chunk = chunk;
        next->begin = this->begin + breakIdx;
        next->end = this->end;
        this->end = next->begin;
    }
};

/**
 * @brief Key-infix storage as a raw pointer + length into chunk memory;
 *        smaller/faster, but pointers dangle if the chunk ever reallocates.
 */
template <typename AtomT, typename ValueT> // NOTE(review): specialization head (and which bool value it covers) reconstructed — TODO confirm
struct PrefixHolder<AtomT, ValueT, true> : public ValueHolder<ValueT>
{
private:
    typedef PrefixHolder self_type;
    typedef std::vector<AtomT> ChunkT;

    AtomT * prefix;
    size_t prefix_len;
    ValueT * value = nullptr; // NOTE(review): appears unused — ValueHolder already owns the value
public:
    typedef const AtomT * key_iterator;

    bool starts_with(AtomT x) const { return *prefix == x; };
    key_iterator kbegin() const { return prefix; };
    key_iterator kend() const { return prefix + prefix_len; };

    // No preferred chunk: callers may append the key anywhere.
    std::nullptr_t insertion_hint() { return nullptr; }

    void setkey(ChunkT * achunk, trie_offset_t k, trie_offset_t kend)
    {
        prefix = std::addressof(achunk->begin()[k]);
        prefix_len = kend - k;
    }

    // Move the tail [breakIdx, len) of this infix into `next`.
    void psplit(self_type * next, int breakIdx)
    {
        next->prefix = this->prefix + breakIdx;
        next->prefix_len = this->prefix_len - breakIdx;
        this->prefix_len -= next->prefix_len;
    }
};

// Maps (atom type, value type, storage flavour) onto a concrete node type.
template <typename AtomT, typename ValueT, bool CInplaceKey, typename Enable = void> // NOTE(review): head reconstructed — TODO confirm
struct TrieNodeSelector
{
    /* No default implementation.
     * Must be specialized by code, which tries to use it. */
};

// Enabled for integral atom types (char, wchar_t, integer atoms, ...).
template <typename AtomT, typename ValueT, bool CInplaceKey>
struct TrieNodeSelector<AtomT, ValueT, CInplaceKey,
    typename std::enable_if<std::is_integral<AtomT>::value>::type> // NOTE(review): enable_if condition reconstructed — TODO confirm
{
    typedef PrefixHolder<AtomT, ValueT, CInplaceKey> PrefixHolderType;
    typedef TrieNode<AtomT, PrefixHolderType> type;
};
417 | */ 418 | detail::trie_offset_t kidx = target->size(); 419 | target->insert(target->end(), it, end); 420 | detail::trie_offset_t kendidx = target->size(); 421 | 422 | n->setkey(target, kidx, kendidx); 423 | } 424 | 425 | EdgeStorageT edges; 426 | 427 | NodeT * root() { return std::addressof(*edges.begin()); } 428 | 429 | NodeT * new_edge(int hint) 430 | { 431 | edges.emplace_back(hint); 432 | return std::addressof(edges.back()); 433 | } 434 | 435 | template 436 | NodeT * insert_edge(NodeT * parent, KeyIterator it, KeyIterator end, const value_type & value) 437 | { 438 | NodeT * n = new_edge(0); 439 | insert_infix(it, end, parent, n); 440 | if (parent != nullptr) { parent->put(n); } 441 | n->set_value(value); 442 | return n; 443 | } 444 | 445 | template 446 | void insert_value(NodeT & at, const value_type & value, const ReplacePolicy & replace) 447 | { 448 | if (at.has_value()) { 449 | replace(at.get_value(), value); 450 | } else { 451 | at.set_value(value); 452 | } 453 | } 454 | 455 | typedef std::shared_ptr IteratorPtr; 456 | 457 | public: 458 | 459 | /** @brief The iterator with very unfair behaviour 460 | * 461 | * There is no const iterator counterpart, because there is no real 462 | * benefit in making this iterator const. 463 | * 464 | * @warning: This operator is not copiable in regular sense. 465 | * In order to get a fixed copy of the iterator, 466 | * use explicit clone() method! 
467 | */ 468 | struct iterator : public std::forward_iterator_tag 469 | { 470 | friend class trie_map; 471 | 472 | private: 473 | IteratorPtr _impl; 474 | 475 | iterator() : _impl() { }; 476 | 477 | void normalize() { 478 | if (!_impl->get()->has_value()) { ++(*this); } 479 | } 480 | 481 | explicit iterator(IteratorPtr a_impl) 482 | : _impl(a_impl) { normalize(); } 483 | 484 | explicit iterator(IteratorInternalT * a_impl) 485 | : _impl(a_impl) { normalize(); } 486 | 487 | public: 488 | value_type & value() { 489 | return _impl->get_value(); 490 | } 491 | 492 | std::basic_string key() { 493 | return _impl->get_key_str(); 494 | } 495 | 496 | value_type & operator *() { return value(); } 497 | 498 | /* 499 | * There is only one increment operator, since the postfix one 500 | * is very heavy if implemented correctly. 501 | */ 502 | iterator & operator ++() 503 | { 504 | if (_impl.get() != nullptr) { 505 | if (!_impl->next_value()) { _impl.reset(); } 506 | } 507 | 508 | return *this; 509 | } 510 | 511 | /** 512 | * Returns a "real" copy of the iterator, may be a heavy operation. 513 | */ 514 | iterator clone() { 515 | return (_impl.get() == nullptr) ? 516 | iterator() : iterator(new IteratorInternalT(*_impl)); 517 | } 518 | 519 | bool operator == (const iterator & other) const 520 | { 521 | return _impl.get() == other._impl.get() 522 | || (_impl.get() && other._impl.get() 523 | && _impl->get() == other._impl->get()); 524 | } 525 | 526 | bool operator != (const iterator & other) const 527 | { 528 | return not (*this == other); 529 | } 530 | }; 531 | 532 | private: 533 | typedef typename NodeT::map_iterator NodeItr; 534 | 535 | /** 536 | * Generalized lookup algorithm. 
537 | */ 538 | template 539 | inline void general_search 540 | ( 541 | NodeT * n, 542 | KeyIterator it, 543 | KeyIterator end, 544 | A exactMatchAction, 545 | B noNextEdgeAction, 546 | C endInTheMiddleAction, 547 | D splitInTheMiddleAction, 548 | E edgeAction 549 | ) 550 | { 551 | key_iterator kbegin = n->kbegin(); 552 | 553 | while (n != nullptr) 554 | { 555 | key_iterator kend = n->kend(); 556 | key_iterator k = kbegin; 557 | 558 | while ((it != end) and (k != kend) and (*k == *it)) 559 | { ++k; ++it; } 560 | 561 | if (it == end) 562 | { 563 | if (k == kend) { 564 | exactMatchAction(n); 565 | } else { 566 | endInTheMiddleAction(n, k); 567 | } 568 | 569 | return; 570 | } 571 | else if (k != kend) 572 | { 573 | splitInTheMiddleAction(n, k, it); 574 | return; 575 | } 576 | 577 | NodeItr next_edge = n->find(*it); 578 | 579 | if (next_edge == n->nf()) 580 | { 581 | noNextEdgeAction(n, it); 582 | return; 583 | } 584 | 585 | edgeAction(next_edge, it); 586 | 587 | n = NodeT::value(next_edge); 588 | kbegin = n->kbegin() + 1; 589 | ++it; /* Already found the first character */ 590 | } 591 | } 592 | 593 | public: 594 | template 595 | void insert(KeyIterator it, KeyIterator end, const value_type & value, 596 | const ReplacePolicy & replace) 597 | { 598 | if (edges.empty()) 599 | { 600 | insert_edge(nullptr, it, end, value); 601 | ++msize; 602 | return; 603 | } 604 | 605 | general_search(root(), it, end, 606 | [this, &value, &replace] (NodeT * n) { 607 | insert_value(*n, value, replace); 608 | }, 609 | 610 | [this, &value, end] (NodeT * n, KeyIterator kit) { 611 | insert_edge(n, kit, end, value); 612 | ++msize; 613 | }, 614 | 615 | [this, &value] (NodeT * n, key_iterator eit) { 616 | n->split(new_edge(1), eit - n->kbegin()); 617 | n->set_value(value); 618 | ++msize; 619 | }, 620 | 621 | [this, &value, end] (NodeT * n, key_iterator eit, KeyIterator kit) { 622 | n->split(new_edge(2), eit - n->kbegin()); 623 | insert_edge(n, kit, end, value); 624 | ++msize; 625 | }, 626 | 627 | 
[] (NodeItr x, KeyIterator) { (void)x; } 628 | ); 629 | } 630 | 631 | size_t size() const noexcept { return msize; } 632 | 633 | template 634 | void add(KeyIterator it, KeyIterator end, const value_type & value) { 635 | return insert(it, end, value, 636 | [] (value_type & old, const value_type & n) { old += n; } ); 637 | } 638 | 639 | template 640 | void insert(KeyIterator it, KeyIterator end, const value_type & value) { 641 | return insert(it, end, value, 642 | [] (value_type & old, const value_type & n) { old = n; }); 643 | } 644 | 645 | template 646 | void insert(const std::basic_string & str, const value_type & value, 647 | const ReplacePolicy & replace) 648 | { 649 | return insert(str.begin(), str.end(), value, replace); 650 | } 651 | 652 | void add(const std::basic_string & str, const value_type & value) { 653 | return add(str.begin(), str.end(), value); 654 | } 655 | 656 | void insert(const std::basic_string & str, const value_type & value) { 657 | return insert(str.begin(), str.end(), value); 658 | } 659 | 660 | private: 661 | template::value>::type> 663 | struct SetSpecific { }; 664 | 665 | public: 666 | 667 | template > 668 | void insert(KeyIterator it, KeyIterator end) { 669 | return insert(it, end, 1); 670 | } 671 | 672 | template > 673 | void add(KeyIterator it, KeyIterator end) { 674 | return add(it, end, 1); 675 | } 676 | 677 | template > 678 | void insert(const std::basic_string & str) { 679 | return insert(str.begin(), str.end(), 1); 680 | } 681 | 682 | template > 683 | void add(const std::basic_string & str) { 684 | return add(str.begin(), str.end(), 1); 685 | } 686 | 687 | template 688 | bool contains(KeyIterator it, KeyIterator end) 689 | { 690 | if (edges.empty()) { return false; } 691 | 692 | bool result = false; 693 | 694 | general_search(root(), it, end, 695 | [&result] (NodeT * n) { 696 | if (n->has_value()) { result = true; } }, 697 | 698 | [] (NodeT * , KeyIterator) { }, 699 | [] (NodeT * , key_iterator ) { }, 700 | [] (NodeT * , 
key_iterator , KeyIterator ) { }, 701 | [] (NodeItr, KeyIterator) { } 702 | ); 703 | 704 | return result; 705 | } 706 | 707 | bool contains(const std::basic_string & str) 708 | { 709 | return contains(str.begin(), str.end()); 710 | } 711 | 712 | template 713 | bool containsPrefixOf(KeyIterator it, KeyIterator end) 714 | { 715 | if (edges.empty()) { return false; } 716 | 717 | bool result = false; 718 | 719 | general_search(root(), it, end, 720 | [&result] (NodeT * n) { result = true; }, 721 | [&result] (NodeT * , KeyIterator) { result = true; }, 722 | [] (NodeT * , key_iterator ) { }, 723 | [] (NodeT * , key_iterator , KeyIterator ) { }, 724 | [] (NodeItr, KeyIterator) { } 725 | ); 726 | 727 | return result; 728 | } 729 | 730 | bool containsPrefixOf(const std::basic_string & str) 731 | { 732 | return containsPrefixOf(str.begin(), str.end()); 733 | } 734 | 735 | private: 736 | template 737 | iterator find_prefix_int(NodeT * root_node, KeyIterator it, KeyIterator kend, CallbackType exactMatch) 738 | { 739 | iterator output; 740 | KeyIterator inputEnd; 741 | 742 | general_search(root_node, it, kend, 743 | /* Exact Match */ 744 | [&exactMatch, &output] (NodeT * n) { 745 | if (n->has_value()) { exactMatch(); } 746 | output._impl.reset(new IteratorInternalT(n)); 747 | }, 748 | 749 | [] (NodeT *, KeyIterator) { }, 750 | 751 | [&output] (NodeT * n, key_iterator) { 752 | output._impl.reset(new IteratorInternalT(n)); 753 | }, 754 | 755 | [] (NodeT *, key_iterator, KeyIterator) { }, 756 | 757 | [&inputEnd] (NodeItr, KeyIterator iend) { 758 | inputEnd = iend; } 759 | ); 760 | 761 | if (output._impl.get() != nullptr) 762 | { 763 | output.normalize(); 764 | std::copy(it, inputEnd, std::back_inserter(output._impl->base_prefix)); 765 | } 766 | 767 | return output; 768 | } 769 | 770 | template 771 | iterator find_prefix_int(NodeT * root_node, KeyIterator it, KeyIterator kend, bool & exactMatch) 772 | { 773 | exactMatch = false; 774 | return find_prefix_int(root_node, it, kend, 
[&exactMatch] () { exactMatch = true; }); 775 | } 776 | 777 | template 778 | iterator find_prefix_int(NodeT * root_node, KeyIterator it, KeyIterator kend, std::nullptr_t) 779 | { 780 | return find_prefix_int(root_node, it, kend, [] () {}); 781 | } 782 | 783 | public: 784 | template 785 | iterator find_prefix(KeyIterator it, KeyIterator kend, CallbackType exactMatch) 786 | { 787 | if (edges.empty()) { return end(); } 788 | return find_prefix_int(root(), it, kend, exactMatch); 789 | } 790 | 791 | /* NOTE : this specialization is needed to catch bool as reference, not as value */ 792 | template 793 | iterator find_prefix(KeyIterator it, KeyIterator kend, bool & exactMatch) 794 | { 795 | if (edges.empty()) { return end(); } 796 | return find_prefix_int(root(), it, kend, exactMatch); 797 | } 798 | 799 | template 800 | iterator find_prefix(const iterator & base, KeyIterator it, KeyIterator kend, CallbackType exactMatch) 801 | { 802 | if (base._impl.get() == nullptr || base._impl->get() == nullptr) { 803 | return end(); 804 | } 805 | 806 | return find_prefix_int(base._impl->get(), it, kend, exactMatch); 807 | } 808 | 809 | template 810 | iterator find_prefix(const std::basic_string & str, CallbackType exactMatch) { 811 | return find_prefix(str.begin(), str.end(), exactMatch); 812 | } 813 | 814 | /* NOTE : this "specialization" (overload actually) is needed to catch 815 | * bool as reference, not as value */ 816 | iterator find_prefix(const std::basic_string & str, bool & exactMatch) { 817 | return find_prefix(str.begin(), str.end(), exactMatch); 818 | } 819 | 820 | iterator find_prefix(const std::basic_string & str) { 821 | return find_prefix(str.begin(), str.end(), [] () {}); 822 | } 823 | 824 | template 825 | iterator find(KeyIterator it, KeyIterator kend) 826 | { 827 | if (edges.empty()) { return end(); } 828 | 829 | IteratorInternalT * root_it = new IteratorInternalT(root()); 830 | IteratorPtr output(root_it); 831 | 832 | general_search(root(), it, kend, 833 | 
[&output] (NodeT * n) { 834 | if (!n->has_value()) { 835 | output.reset(); } }, 836 | 837 | [&output] (NodeT * n, KeyIterator ) { output.reset(); }, 838 | [&output] (NodeT * n, key_iterator ) { output.reset(); }, 839 | [&output] (NodeT * n, key_iterator, KeyIterator ) 840 | { output.reset(); }, 841 | 842 | [&output] (NodeItr x, KeyIterator) { output->push(x); } 843 | ); 844 | 845 | return (output == nullptr) ? iterator() : iterator(output); 846 | } 847 | 848 | iterator find(const std::basic_string & str) 849 | { 850 | return find(str.begin(), str.end()); 851 | } 852 | 853 | iterator begin() { 854 | return edges.empty() ? end() : 855 | iterator(IteratorPtr(new IteratorInternalT(root()))); } 856 | 857 | iterator end() { return iterator(); } 858 | 859 | template 860 | value_type * get(KeyIterator it, KeyIterator end) 861 | { 862 | if (edges.empty()) { return nullptr; } 863 | 864 | value_type * result = nullptr; 865 | 866 | general_search(root(), it, end, 867 | [&result] (NodeT * n) { 868 | if (n->has_value()) { 869 | result = std::addressof(n->get_value()); } 870 | }, 871 | 872 | [] (NodeT * , KeyIterator) { }, 873 | [] (NodeT * , key_iterator ) { }, 874 | [] (NodeT * , key_iterator , KeyIterator ) { }, 875 | [] (NodeItr, KeyIterator) { } 876 | ); 877 | 878 | return result; 879 | } 880 | 881 | value_type * get(const std::basic_string & str) 882 | { 883 | return get(str.begin(), str.end()); 884 | } 885 | 886 | template 887 | value_type & at(KeyIterator it, KeyIterator end) 888 | { 889 | value_type * result = get(it, end); 890 | 891 | if (result == nullptr) { 892 | throw std::out_of_range("trie::at"); 893 | } 894 | 895 | return *result; 896 | } 897 | 898 | value_type & at(const std::basic_string & str) 899 | { 900 | return at(str.begin(), str.end()); 901 | } 902 | 903 | value_type & operator [](const std::basic_string & str) 904 | { 905 | return at(str.begin(), str.end()); 906 | } 907 | 908 | size_t _edges() { return edges.size(); } 909 | size_t _keys() { return 
keys.size(); } 910 | 911 | struct _debug_print 912 | { 913 | const trie_map & map; 914 | 915 | _debug_print(const trie_map & amap) : map(amap) {}; 916 | 917 | std::ostream & operator ()(std::ostream & stream) const 918 | { 919 | if (map.edges.empty()) 920 | { 921 | return stream << "[ empty ]"; 922 | } 923 | 924 | trie_map::IteratorInternalT it(std::addressof(map.edges[0])); 925 | 926 | while (it.m_root != nullptr) 927 | { 928 | const trie_map::NodeT * n = it.get(0); 929 | 930 | std::copy(n->kbegin(), n->kend(), 931 | std::ostream_iterator(stream)); 932 | 933 | if (n->has_value()) 934 | { 935 | stream << "(=" << n->get_value() << ")"; 936 | } 937 | 938 | if (it.step_down()) { stream << "{"; continue; } 939 | if (it.step_fore()) { stream << "}{"; continue; } 940 | 941 | while (!it.m_ptrs.empty()) 942 | { 943 | stream << "}"; 944 | if (it.step_up()) { break; } 945 | } 946 | 947 | if (it.m_ptrs.empty()) { it.m_root = nullptr; } 948 | } 949 | 950 | return stream; 951 | } 952 | 953 | friend std::ostream & operator << (std::ostream & stream, 954 | const _debug_print & x) 955 | { 956 | return x(stream); 957 | } 958 | }; 959 | }; 960 | 961 | /** 962 | * @warning: operator== ALWAYS returns \true if 963 | * the left operand dereferences to \0. 964 | */ 965 | template 966 | struct CStrIterator : std::forward_iterator_tag 967 | { 968 | private: 969 | AtomT * m_str; 970 | typedef CStrIterator self_type; 971 | public: 972 | CStrIterator(AtomT * a_str) : m_str(a_str) {} 973 | explicit CStrIterator(AtomT * a_str, size_t offset) : m_str(a_str + offset) {} 974 | 975 | self_type operator ++() { return m_str++; } 976 | AtomT & operator *() { return *m_str; } 977 | 978 | bool operator ==(const self_type & other) const 979 | { 980 | return m_str == other.m_str || *m_str == '\0'; 981 | } 982 | }; 983 | 984 | }; 985 | --------------------------------------------------------------------------------