├── CMakeLists.txt ├── LICENSE ├── README.md ├── include └── taihen │ ├── lexer.h │ └── parser.h ├── src ├── CMakeLists.txt ├── lexer.c └── parser.c └── test ├── CMakeLists.txt ├── test_lexer.cpp └── test_parser.cpp /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1.0) 2 | 3 | project(taihen-parser) 4 | 5 | option(TEST "build and perform tests" OFF) 6 | option(INSTALL_ENABLED "if enabled add files to cmake's install()" ON) 7 | option(USE_INBUILT_CTYPE "use internal ctype over system" OFF) 8 | option(USE_INBUILT_STRING "use internal string over system" OFF) 9 | 10 | include_directories(include) 11 | 12 | add_subdirectory(src) 13 | 14 | if (${TEST}) 15 | add_subdirectory(test) 16 | 17 | enable_testing() 18 | add_test(NAME LexerTest COMMAND test-lexer) 19 | endif() 20 | 21 | if (${INSTALL_ENABLED}) 22 | install(DIRECTORY include/taihen/ 23 | DESTINATION include/taihen 24 | FILES_MATCHING PATTERN "*.h") 25 | endif() 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 David "Davee" Morgan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # taihen-parser - _taiHEN's configuration parser_ 2 | 3 | taiHEN is a custom firmware (CFW) framework for PS Vita™ and implements a configuration to help packagers and users control which modules or plugins are loaded and when. 4 | taihen-parser provides a convenient C API for interacting with these configuration files to help developers write supporting tools for taiHEN. taihen-parser provides both a lexer and parser API for configuration files. 5 | 6 | The problem with CFW of a previous era was that it was one person's imagination of a custom firmware. Average developer X could not easily replace the in-game menu within the provided CFW. Likewise, average user Y could not strip out features they did not like. Here, taiHEN provides a solution by providing a configuration file where CFW can be defined as a set of modules and plugins. 7 | 8 | Person X may like live-area mod1 and person Y may like live-area mod2. No longer do these two need to chose between CFW A and CFW B that implements mod1 and mod2, respectively. Instead, they can modify this configuration from their favourite CFW to use whichever mod they prefer. This architecture promotes the _custom_ in _custom firmware_ by encouraging developers to move away from huge monolithic CFW of the past and help nurture an open, compatible and _user orientated_ custom firmware experience. 9 | 10 | ## Configuration Format 11 | taiHEN employs a text based format for configuring automatic loading of modules. The configuration format is a UTF-8 text file that utilises line seperation to ease parsing and human readability. Each line must be exclusive to one of four types: 12 | - An empty line 13 | - A comment 14 | - Section 15 | - Module path 16 | 17 | Each line can be at most ```CONFIG_MAX_LINE_LENGTH``` characters wide, and trailing/leading whitespace is permitted. 18 | 19 | ## Lexer Tokens 20 | The config lexer produces the following tokens: 21 | - ```CONFIG_START_TOKEN``` 22 | - ```CONFIG_END_TOKEN``` 23 | - ```CONFIG_COMMENT_TOKEN``` 24 | - ```CONFIG_SECTION_TOKEN``` 25 | - ```CONFIG_SECTION_HALT_TOKEN``` 26 | - ```CONFIG_SECTION_NAME_TOKEN``` 27 | - ```CONFIG_PATH_TOKEN``` 28 | 29 | A valid configuration format should obey the grammar: 30 | ``` 31 | config ::= CONFIG_START_TOKEN (CONFIG_COMMENT_TOKEN | section)* CONFIG_END_TOKEN 32 | section ::= CONFIG_SECTION_TOKEN CONFIG_SECTION_HALT_TOKEN? CONFIG_SECTION_NAME_TOKEN ('\n' | EOF) path* 33 | path ::= CONFIG_PATH_TOKEN ('\n' | EOF) 34 | ``` 35 | 36 | ## Sections: ```*``` 37 | A section in the configuration file functions as a filter and controller for CFW module loading. 38 | Each section begins with a ```*``` and can optionally be followed with a ```!``` to mark the section as a halt point (see further below). After these tokens, the rest of the line a UTF-8 name for the section. 39 | 40 | A section of the same name may appear in the file multiple times. This functionality is intended to allow users to take advantage of taiHEN's load ordering policy. 41 | 42 | ### Halt point: ```!``` 43 | A section can optionally have the halt point token ```!``` following the section token ```*``` in the configuration file. This token instructs the parser to stop further parsing of the file if the section name is within context. See the examples below for a visual worked case on this feature. 44 | 45 | ### Reserved names 46 | There are currently two reserved names for sections: 47 | - ```ALL``` - A catch all user-mode section that will load the modules it contains for every user-mode process. 48 | - ```KERNEL``` - A section that loads resident kernel modules on the start of taiHEN. 49 | 50 | Using the halt point ```!``` on these sections results in undefined behaviour. 51 | 52 | ## API 53 | This API currently offers no guarantee of stability. Please remember that it may change drastically upon any future versions of taiHEN. 54 | taiHEN's configuration parser exposes it's lexer algorithm to assist in development of supporting tools. Please consult the header files for documentation. 55 | 56 | ## Example Configurations 57 | 58 | Below is an example of a very simple configuration: 59 | ``` 60 | # example simple config 61 | *ALL 62 | ux0:/plugins/my_plugin.suprx 63 | ux0:/plugins/my_plugin2.suprx 64 | ``` 65 | 66 | This example consists of a single section ```ALL```. Which means that every game/application/homebrew that is launched will have both ```my_plugin.suprx``` and ```my_plugin2.suprx``` loaded in that process space and in order. 67 | 68 | More precise functionality may be required for certain homebrew. Perhaps you wish to package your own CFW, in which case you may create a complex configuration as shown below: 69 | ``` 70 | # hello this is a comment. this line is ignored 71 | # this line also 72 | # this too, whitespace at the start of a line is OK 73 | *COOL_GAME 74 | # i'm within a section, woo! 75 | ux0:/coolgame/plugin.suprx 76 | # indentation is ok with me 77 | ux0:/coolgame/plugin2.suprx 78 | # spaces within path is ok 79 | ux0:/really cool/I haVe spaces and caps/plugin3.suprx 80 | # next section 81 | *ALL 82 | # i'm a special section! 83 | # i'm always included... usually 84 | ux0:/plugins/ingamemusic.suprx 85 | *KERNEL 86 | # i'm a special section also! 87 | # my plugins are loaded to kernel memory as resident modules 88 | ux0:/taihen/henkaku.skprx 89 | ux0:/psphacks.skprx 90 | *COOL_GAME 91 | # this section again?! this is ok! this is a way packagers 92 | # can take advantage of load order. 93 | ux0:/coolgame/idependoningamemusic.suprx 94 | *!COOL_GAME2 95 | # what is the '!' for? 96 | # the '!' prevents further parsing 97 | # this would make more sense to put at the start if you want to 98 | # blacklist certain modules 99 | # look, nothing to load! 100 | *ALL 101 | ux0:/plugins/ibreak_coolgame2.suprx 102 | 103 | # emojis? 104 | ux0:/🤔/🦄/👻/🎃.suprx 105 | ``` 106 | Much more complex, but really I expect even more complexity when real CFW components come around. As mentioned previously, parsing occurs from top to bottom, identical to load order. When parsing, a section context is selected. In the case of taiHEN, this context is a title id such as ```MLCL00001``` for our molecularShell homebrew. In this case, lets assume for ease that we have selected ```COOL_GAME``` and it is a user-mode process. 107 | 108 | Comments are ignored, so lets continue until we reach the first section: ```COOL_GAME```. Since our selected section matches this first section, the paths below are loaded until a new section is reached. 109 | - ```ux0:/coolgame/plugin.suprx``` 110 | - ```ux0:/coolgame/plugin2.suprx``` 111 | - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx``` 112 | 113 | Then we reach a new section ```ALL```. As mentioned above, ```ALL``` is a special reserved section name that matches every user-mode process. So our loaded module list grows: 114 | - ```ux0:/coolgame/plugin.suprx``` 115 | - ```ux0:/coolgame/plugin2.suprx``` 116 | - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx``` 117 | - ```ux0:/plugins/ingamemusic.suprx``` 118 | 119 | Next section we reach is the special section ```KERNEL```. This is not processed within our context we so continue until we reach the next section: ```COOL_GAME```. We have already had this section before, but we multiple sections are allowed to take advantage of taiHEN's module load ordering. This is extremely useful when you have dependencies between plugins/modules that need resolved. In this example we have ```idependoningamemusic.suprx``` which must be loaded after ```ingamemusic.suprx```. 120 | 121 | Our load list now looks like: 122 | - ```ux0:/coolgame/plugin.suprx``` 123 | - ```ux0:/coolgame/plugin2.suprx``` 124 | - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx``` 125 | - ```ux0:/plugins/ingamemusic.suprx``` 126 | - ```ux0:/coolgame/idependoningamemusic.suprx``` 127 | 128 | Next section is ```COOL_GAME2``` which does not match our section context. This section has a halt point ```!``` but we ignore it in this case because we do not much it. 129 | 130 | Lastly, we have the final section ```ALL``` again, which completes our load list: 131 | - ```ux0:/coolgame/plugin.suprx``` 132 | - ```ux0:/coolgame/plugin2.suprx``` 133 | - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx``` 134 | - ```ux0:/plugins/ingamemusic.suprx``` 135 | - ```ux0:/coolgame/idependoningamemusic.suprx``` 136 | - ```ux0:/plugins/ibreak_coolgame2.suprx``` 137 | - ```ux0:/🤔/🦄/👻/🎃.suprx``` 138 | 139 | NOTE: I don't know conclusively if the Vita filesystem supports emojis. Don't use them... 140 | 141 | ### ```COOL_GAME2``` Halt Point Example 142 | Following the same logic as above, we will walk through the configuration as ```COOL_GAME2``` context. 143 | 144 | First section is ```COOL_GAME```, not a match so we skip it. 145 | 146 | Second section is ```ALL```, so we load modules from it: 147 | - ```ux0:/plugins/ingamemusic.suprx``` 148 | 149 | Third section is ```KERNEL```, so we skip it. 150 | 151 | Fourth section is ```COOL_GAME``` again so we skip it. 152 | 153 | Fifth section is ```COOL_GAME2``` so we process it. This time we have a halt point so this will be the last section we process. Remember, the halt point ```!``` stops any further parsing. This section however has no modules, so nothing is loaded. A section with no modules is OK. In this case, the following ```ALL``` section breaks ```COOL_GAME2``` in our hypothetical world. By using the halt point correctly, a CFW packager can maximise compatibility whilst maintaining load ordering. 154 | 155 | Our final module loading list for ```COOL_GAME2```: 156 | - ```ux0:/plugins/ingamemusic.suprx``` 157 | 158 | # Building 159 | To build taihen-parser, you require CMake to generate the appropriate build scripts. 160 | From within the repository directory: 161 | ```sh 162 | $ mkdir build && cd build 163 | $ cmake .. 164 | $ make 165 | ``` 166 | 167 | To build the included tests you require the boost ```unit_test_framework``` installed. Then instead use: 168 | ```sh 169 | $ mkdir build && cd build 170 | $ cmake -DTEST=ON .. 171 | $ make 172 | ``` 173 | 174 | # Installation 175 | To install to a specified location define ```CMAKE_INSTALL_PREFIX```: 176 | ```sh 177 | $ mkdir build && cd build 178 | $ cmake -DCMAKE_INSTALL_PREFIX=/my/install/location .. 179 | $ make 180 | $ make install 181 | ``` 182 | 183 | # Acknowledgements 184 | Team molecule for HENkaku, Yifan Lu for taiHEN and xyz for immense support of the vitasdk. 185 | 186 | ## License 187 | taihen-parser is licensed under the terms of the MIT license which can be read in the ```LICENSE``` file in the root of the repository. 188 | (C) 2016 David "Davee" Morgan 189 | -------------------------------------------------------------------------------- /include/taihen/lexer.h: -------------------------------------------------------------------------------- 1 | #ifndef LEXER_H 2 | #define LEXER_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #define CONFIG_MAX_LINE_LENGTH (256) 9 | 10 | typedef enum 11 | { 12 | CONFIG_START_TOKEN, 13 | CONFIG_END_TOKEN, 14 | CONFIG_COMMENT_TOKEN, 15 | CONFIG_SECTION_TOKEN, 16 | CONFIG_SECTION_HALT_TOKEN, 17 | CONFIG_SECTION_NAME_TOKEN, 18 | CONFIG_PATH_TOKEN 19 | } taihen_config_lexer_token; 20 | 21 | typedef struct 22 | { 23 | const char *input; 24 | const char *end; 25 | taihen_config_lexer_token token; 26 | char line[CONFIG_MAX_LINE_LENGTH]; 27 | char *line_pos; 28 | } taihen_config_lexer; 29 | 30 | int taihen_config_init_lexer(taihen_config_lexer *ctx, const char *input); 31 | int taihen_config_lex(taihen_config_lexer *ctx); 32 | 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | #endif // LEXER_H 37 | -------------------------------------------------------------------------------- /include/taihen/parser.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSER_H 2 | #define PARSER_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | typedef void (* taihen_config_handler)(const char *module, void *param); 9 | 10 | int taihen_config_validate(const char *input); 11 | void taihen_config_parse(const char *input, const char *section, taihen_config_handler handler, void *param); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | #endif // PARSER_H 17 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(CheckIncludeFile) 2 | 3 | if (MSVC) 4 | add_definitions(-Dinline=__inline) 5 | endif() 6 | 7 | check_include_file(ctype.h HAVE_CTYPE) 8 | check_include_file(string.h HAVE_STRING) 9 | 10 | if ((NOT ${HAVE_CTYPE}) OR ${USE_INBUILT_CTYPE}) 11 | add_definitions(-DNO_CTYPE) 12 | endif() 13 | 14 | if ((NOT ${HAVE_STRING}) OR ${USE_INBUILT_STRING}) 15 | add_definitions(-DNO_STRING) 16 | endif() 17 | 18 | add_definitions(-Os -ffunction-sections -fdata-sections) 19 | 20 | add_library(taihenparser lexer.c parser.c) 21 | 22 | if (${INSTALL_ENABLED}) 23 | install(TARGETS taihenparser 24 | RUNTIME DESTINATION bin 25 | LIBRARY DESTINATION lib 26 | ARCHIVE DESTINATION lib) 27 | endif() 28 | -------------------------------------------------------------------------------- /src/lexer.c: -------------------------------------------------------------------------------- 1 | /* 2 | * lexer.c - tokenisation algorithm for taihen configuration files 3 | * 4 | * Copyright (C) 2016 David "Davee" Morgan 5 | * 6 | * This software may be modified and distributed under the terms 7 | * of the MIT license. See the LICENSE file for details. 8 | */ 9 | 10 | #include 11 | 12 | #ifndef NO_STRING 13 | #include 14 | #endif // NO_STRING 15 | #ifndef NO_CTYPE 16 | #include 17 | #endif // NO_CTYPE 18 | 19 | static const char TOKEN_EMPTY = '\0'; 20 | static const char TOKEN_COMMENT_START = '#'; 21 | static const char TOKEN_SECTION_START = '*'; 22 | static const char TOKEN_HALT = '!'; 23 | 24 | #ifdef NO_CTYPE 25 | static int isspace(int c) 26 | { 27 | // we use "C" locale 28 | return (c == ' ') 29 | || (c == '\t') 30 | || (c == '\n') 31 | || (c == '\v') 32 | || (c == '\f') 33 | || (c == '\r'); 34 | } 35 | #endif // NO_CTYPE 36 | 37 | #ifdef NO_STRING 38 | #include 39 | 40 | static size_t strlen(const char *s) 41 | { 42 | size_t idx = 0; 43 | 44 | while (s[idx]) 45 | { 46 | ++idx; 47 | } 48 | 49 | return idx; 50 | } 51 | 52 | static void *memset(void *s, int c, size_t len) 53 | { 54 | unsigned char *p = (unsigned char *)s; 55 | 56 | while (len--) 57 | { 58 | *p++ = (unsigned char)c; 59 | } 60 | 61 | return s; 62 | } 63 | 64 | static void *memcpy(void *s1, const void * s2, size_t len) 65 | { 66 | char *dest = (char *)s1; 67 | const char *src = (const char *)s2; 68 | 69 | while (len--) 70 | { 71 | *dest++ = *src++; 72 | } 73 | 74 | return s1; 75 | } 76 | #endif // NO_STRING 77 | 78 | static char *skip_whitespace(char *input) 79 | { 80 | while (isspace((unsigned char)*input)) 81 | { 82 | ++input; 83 | } 84 | 85 | return input; 86 | } 87 | 88 | static void trim_whitespace(char *input) 89 | { 90 | char *end = input + strlen(input)-1; 91 | 92 | while (end > input) 93 | { 94 | if (!isspace((unsigned char)*end)) 95 | { 96 | break; 97 | } 98 | 99 | *end = '\0'; 100 | end--; 101 | } 102 | } 103 | 104 | static const char *get_newline(const char *input) 105 | { 106 | while (*input) 107 | { 108 | if (*input == '\r' || *input == '\n') 109 | { 110 | break; 111 | } 112 | 113 | ++input; 114 | } 115 | 116 | return input; 117 | } 118 | 119 | static int lex_line(taihen_config_lexer *ctx) 120 | { 121 | if (ctx->input >= ctx->end) 122 | { 123 | ctx->token = CONFIG_END_TOKEN; 124 | return 0; 125 | } 126 | 127 | const char *line_end = get_newline(ctx->input); 128 | size_t len = line_end - ctx->input; 129 | 130 | 131 | // check our line can fit in our buffer 132 | if (len >= CONFIG_MAX_LINE_LENGTH) 133 | { 134 | return -1; 135 | } 136 | 137 | // copy line to our buffer so we can modify it 138 | memcpy(ctx->line, ctx->input, len); 139 | ctx->line[len] = '\0'; 140 | ctx->line_pos = ctx->line; 141 | ctx->input = line_end+1; 142 | 143 | // remove leading whitespace 144 | ctx->line_pos = skip_whitespace(ctx->line_pos); 145 | 146 | // check for empty line or comment 147 | if (*ctx->line_pos == TOKEN_EMPTY || *ctx->line_pos == TOKEN_COMMENT_START) 148 | { 149 | ctx->token = CONFIG_COMMENT_TOKEN; 150 | return 1; 151 | } 152 | 153 | // remove any trailing whitespace 154 | trim_whitespace(ctx->line_pos); 155 | 156 | // check if our line is empty now 157 | if (*ctx->line_pos == TOKEN_EMPTY) 158 | { 159 | ctx->token = CONFIG_COMMENT_TOKEN; 160 | return 1; 161 | } 162 | 163 | // check for section start 164 | if (*ctx->line_pos == TOKEN_SECTION_START) 165 | { 166 | ctx->token = CONFIG_SECTION_TOKEN; 167 | } 168 | else 169 | { 170 | // should be a path 171 | ctx->token = CONFIG_PATH_TOKEN; 172 | } 173 | 174 | return 1; 175 | } 176 | 177 | static int lex_section_halt(taihen_config_lexer *ctx) 178 | { 179 | // skip more whitespace 180 | ctx->line_pos = skip_whitespace(ctx->line_pos+1); 181 | 182 | // check for halt token 183 | if (*ctx->line_pos == TOKEN_HALT) 184 | { 185 | ctx->token = CONFIG_SECTION_HALT_TOKEN; 186 | } 187 | else 188 | { 189 | // should be a name 190 | ctx->token = CONFIG_SECTION_NAME_TOKEN; 191 | } 192 | 193 | return 1; 194 | } 195 | 196 | static int lex_section_name(taihen_config_lexer *ctx) 197 | { 198 | // skip more whitespace 199 | ctx->line_pos = skip_whitespace(ctx->line_pos+1); 200 | 201 | // should be a name 202 | ctx->token = CONFIG_SECTION_NAME_TOKEN; 203 | return 1; 204 | } 205 | 206 | /*! 207 | \brief Initialise or reset lexer context. 208 | 209 | taihen_config_init_lexer will init/reset the provided taihen_config_lexer and assign the 210 | provided input to the context. 211 | 212 | \param ctx A non-null pointer to a context to initialise or reset. 213 | \param input A non-null UTF-8 encoded null-terminated string to tokenise. 214 | \return zero on success, < 0 on error. 215 | */ 216 | int taihen_config_init_lexer(taihen_config_lexer *ctx, const char *input) 217 | { 218 | if (ctx == NULL || input == NULL) 219 | { 220 | return -1; 221 | } 222 | 223 | // reset everything to default and reset input/end pointer 224 | memset(ctx, 0, sizeof(taihen_config_lexer)); 225 | ctx->token = CONFIG_START_TOKEN; 226 | ctx->input = input; 227 | ctx->end = input + strlen(input); 228 | return 0; 229 | } 230 | 231 | /*! 232 | \brief Retrieve the next lexer token. 233 | 234 | taihen_config_lex will accept an initialised context and provide the next token 235 | in the stream. This tokenisation does no checking on formatting and as such does not 236 | confirm that the document provided is well-formed. 237 | 238 | \param ctx A non-null point to an initialised context. 239 | \return 0 if there are no further tokens, > 0 if there are further tokens else < 0 on error. 240 | \sa taihen_config_init_lexer 241 | */ 242 | int taihen_config_lex(taihen_config_lexer *ctx) 243 | { 244 | if (ctx == NULL) 245 | { 246 | return -1; 247 | } 248 | 249 | switch (ctx->token) 250 | { 251 | case CONFIG_START_TOKEN: 252 | case CONFIG_COMMENT_TOKEN: 253 | case CONFIG_PATH_TOKEN: 254 | case CONFIG_SECTION_NAME_TOKEN: 255 | return lex_line(ctx); 256 | 257 | case CONFIG_SECTION_TOKEN: 258 | return lex_section_halt(ctx); 259 | 260 | case CONFIG_SECTION_HALT_TOKEN: 261 | return lex_section_name(ctx); 262 | 263 | case CONFIG_END_TOKEN: 264 | default: 265 | return -1; 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /src/parser.c: -------------------------------------------------------------------------------- 1 | /* 2 | * parser.c - parser algorithm for taihen configuration files 3 | * 4 | * Copyright (C) 2016 David "Davee" Morgan 5 | * 6 | * This software may be modified and distributed under the terms 7 | * of the MIT license. See the LICENSE file for details. 8 | */ 9 | 10 | #include 11 | #include 12 | 13 | #ifndef NO_STRING 14 | #include 15 | #endif // NO_STRING 16 | 17 | static const char *TOKEN_ALL_SECTION = "ALL"; 18 | static const char *TOKEN_KERNEL_SECTION = "KERNEL"; 19 | 20 | #ifdef NO_STRING 21 | #include 22 | 23 | static size_t strlen(const char *s) 24 | { 25 | size_t idx = 0; 26 | 27 | while (s[idx]) 28 | { 29 | ++idx; 30 | } 31 | 32 | return idx; 33 | } 34 | 35 | static int strcmp(const char * s1, const char * s2) 36 | { 37 | while ((*s1) && (*s1 == *s2)) 38 | { 39 | ++s1; 40 | ++s2; 41 | } 42 | return (*s1 - *s2); 43 | } 44 | 45 | #endif // NO_STRING 46 | 47 | static inline int is_continuation_byte(char b) 48 | { 49 | return ((b & 0xC0) == 0x80); 50 | } 51 | 52 | static inline int check_continuation_bytes(const char *start, const char *end, int len) 53 | { 54 | if ((end - start) < len) 55 | { 56 | return 0; 57 | } 58 | 59 | for (int i = 0; i < len; ++i) 60 | { 61 | if (!is_continuation_byte(start[i])) 62 | { 63 | return 0; 64 | } 65 | } 66 | 67 | return 1; 68 | } 69 | 70 | static int check_utf8_sequence(const char *str, const char *end, unsigned char mask, unsigned char lead, int cont_len) 71 | { 72 | if ((*str & mask) == lead) 73 | { 74 | if (check_continuation_bytes(str+1, end, cont_len)) 75 | { 76 | return -1; 77 | } 78 | 79 | return 1; 80 | } 81 | 82 | return 0; 83 | } 84 | 85 | static int check_utf8(const char *str) 86 | { 87 | struct 88 | { 89 | unsigned char mask; 90 | unsigned char lead; 91 | unsigned char cont_len; 92 | } utf8_lut[4] = 93 | { 94 | { 0x80, 0x00, 0 }, // U+0000 -> U+007F, 0xxxxxx 95 | { 0xE0, 0xC0, 1 }, // U+0080 -> U+07FF, 110xxxxx 96 | { 0xF0, 0xE0, 2 }, // U+0800 -> U+FFFF, 1110xxxx 97 | { 0xF8, 0xF0, 3 }, // U+10000 -> U+10FFFF, 11110xxx 98 | }; 99 | 100 | const char *end = str + strlen(str); 101 | 102 | while (str < end) 103 | { 104 | int i = 0; 105 | 106 | for (i = 0; i < 4; ++i) 107 | { 108 | int res = check_utf8_sequence(str, end, utf8_lut[i].mask, utf8_lut[i].lead, utf8_lut[i].cont_len); 109 | 110 | // check if valid sequence but incorrect contiunation 111 | if (res < 0) 112 | { 113 | return 0; 114 | } 115 | 116 | // check if valid sequence 117 | if (res > 0) 118 | { 119 | str += utf8_lut[i].cont_len+1; 120 | break; 121 | } 122 | } 123 | 124 | // check if we had no valid sequences 125 | if (i == 4) 126 | { 127 | return 0; 128 | } 129 | } 130 | 131 | return 1; 132 | } 133 | 134 | /*! 135 | \brief Check whether a configuration is valid syntax. 136 | 137 | taihen_config_validate is used to check whether a provided configuration is valid syntax. 138 | This is useful when used before taihen_config_parse to provide error checking before stream based 139 | parsing. 140 | 141 | \param input A UTF-8 encoded null-terminated string containing the configuration to check. 142 | \return non-zero on valid configuration, else zero on invalid. 143 | \sa taihen_config_parse 144 | */ 145 | int taihen_config_validate(const char *input) 146 | { 147 | taihen_config_lexer ctx; 148 | taihen_config_init_lexer(&ctx, input); 149 | 150 | int have_section = 0; 151 | int lex_result = 0; 152 | 153 | while ((lex_result = taihen_config_lex(&ctx)) > 0) 154 | { 155 | switch (ctx.token) 156 | { 157 | case CONFIG_SECTION_NAME_TOKEN: 158 | // ensure we actually have a string 159 | if (strlen(ctx.line_pos) == 0) 160 | { 161 | return 0; 162 | } 163 | 164 | // validate it is UTF-8 165 | if (!check_utf8(ctx.line_pos)) 166 | { 167 | return 0; 168 | } 169 | 170 | have_section = 1; 171 | break; 172 | 173 | case CONFIG_PATH_TOKEN: 174 | if (!have_section) 175 | { 176 | // paths must belong to a section 177 | return 0; 178 | } 179 | 180 | // ensure we actually have a string 181 | if (strlen(ctx.line_pos) == 0) 182 | { 183 | return 0; 184 | } 185 | 186 | // validate it is UTF-8 187 | if (!check_utf8(ctx.line_pos)) 188 | { 189 | return 0; 190 | } 191 | 192 | break; 193 | 194 | // ignore these, nothing to check 195 | case CONFIG_SECTION_HALT_TOKEN: 196 | case CONFIG_COMMENT_TOKEN: 197 | case CONFIG_SECTION_TOKEN: 198 | case CONFIG_END_TOKEN: 199 | break; 200 | 201 | // unexpected tokens, invalid document 202 | default: 203 | return 0; 204 | } 205 | } 206 | 207 | return (lex_result == 0); 208 | } 209 | 210 | /*! 211 | \brief taihen_config_parse parses a configuration for contextualised paths. 212 | 213 | taihen_config_parse is used to obtain an ordered stream of the paths appropriate for the section provided. 214 | Special sections such as ALL and KERNEL will be taken into consideration when generating the stream. 215 | 216 | taihen_config_parse provides no error checking or handling. Use taihen_config_validate before parsing the 217 | document to avoid errors in parsing. 218 | 219 | \param input A UTF-8 encoded null-terminated string containing the configuration to parse. 220 | \param section A UTF-8 encoded null-terminated string containing the section to base context from. 221 | \param handler A taihen_config_handler to receive the stream of paths. 222 | \param param A user provided value that is passed to the provided taihen_config_handler. 223 | \sa taihen_config_validate 224 | */ 225 | void taihen_config_parse(const char *input, const char *section, taihen_config_handler handler, void *param) 226 | { 227 | taihen_config_lexer ctx; 228 | taihen_config_init_lexer(&ctx, input); 229 | 230 | int halt_flag = 0; 231 | int record_entries = 0; 232 | 233 | while (taihen_config_lex(&ctx) > 0) 234 | { 235 | switch (ctx.token) 236 | { 237 | case CONFIG_SECTION_HALT_TOKEN: 238 | halt_flag = 1; 239 | break; 240 | 241 | case CONFIG_SECTION_NAME_TOKEN: 242 | if (strcmp(ctx.line_pos, TOKEN_ALL_SECTION) == 0 && strcmp(section, TOKEN_KERNEL_SECTION) != 0) 243 | { 244 | record_entries = 1; 245 | } 246 | else if (strcmp(section, ctx.line_pos) == 0) 247 | { 248 | record_entries = 1; 249 | } 250 | else 251 | { 252 | record_entries = 0; 253 | } 254 | 255 | break; 256 | 257 | case CONFIG_SECTION_TOKEN: 258 | if (record_entries && halt_flag) 259 | { 260 | return; 261 | } 262 | 263 | halt_flag = 0; 264 | break; 265 | 266 | case CONFIG_PATH_TOKEN: 267 | if (record_entries) 268 | { 269 | handler(ctx.line_pos, param); 270 | } 271 | 272 | break; 273 | 274 | default: 275 | break; 276 | } 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(Boost COMPONENTS unit_test_framework REQUIRED) 2 | 3 | include_directories(${taihen-config_SOURCE_DIR}/src ${Boost_INCLUDE_DIRS}) 4 | link_directories(${Boost_LIBRARY_DIRS}) 5 | 6 | add_executable(test-lexer test_lexer.cpp) 7 | target_link_libraries(test-lexer taihenconfig) 8 | 9 | add_executable(test-parser test_parser.cpp) 10 | target_link_libraries(test-parser taihenconfig) 11 | -------------------------------------------------------------------------------- /test/test_lexer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BOOST_TEST_MODULE lexer 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | BOOST_AUTO_TEST_CASE(init_lexer) 10 | { 11 | const char *input = ""; 12 | taihen_config_lexer ctx; 13 | 14 | // test NULL parameter handling 15 | BOOST_REQUIRE_LT(taihen_config_init_lexer(NULL, NULL), 0); 16 | BOOST_REQUIRE_LT(taihen_config_init_lexer(&ctx, NULL), 0); 17 | BOOST_REQUIRE_LT(taihen_config_init_lexer(NULL, input), 0); 18 | 19 | // test correct input 20 | BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, input), 0); 21 | } 22 | 23 | BOOST_AUTO_TEST_CASE(empty_lex) 24 | { 25 | const char *input = ""; 26 | taihen_config_lexer ctx; 27 | 28 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 29 | 30 | // we should expect immediate end of stream 31 | BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0); 32 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 33 | } 34 | 35 | BOOST_AUTO_TEST_CASE(reset_lexer) 36 | { 37 | const char *input = ""; 38 | taihen_config_lexer ctx; 39 | 40 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 41 | 42 | // we should expect immediate end of stream 43 | BOOST_WARN_EQUAL(taihen_config_lex(&ctx), 0); 44 | BOOST_WARN_EQUAL(ctx.token, CONFIG_END_TOKEN); 45 | 46 | // reset the lexer 47 | BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, input), 0); 48 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_START_TOKEN); 49 | } 50 | 51 | BOOST_AUTO_TEST_CASE(simple_section_lex) 52 | { 53 | const char *input = "*MY SECTION"; 54 | taihen_config_lexer ctx; 55 | 56 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 57 | 58 | // we should expect section token 59 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 60 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN); 61 | 62 | // then we expect name 63 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 64 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN); 65 | 66 | // check name is still "MY SECTION" 67 | BOOST_TEST(ctx.line_pos == "MY SECTION"); 68 | 69 | // then we expect end of stream 70 | BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0); 71 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 72 | } 73 | 74 | BOOST_AUTO_TEST_CASE(complex_section_lex) 75 | { 76 | const char *input = "*!MY SECTION"; 77 | taihen_config_lexer ctx; 78 | 79 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 80 | 81 | // we should expect section token 82 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 83 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN); 84 | 85 | // we should expect section halt token 86 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 87 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_HALT_TOKEN); 88 | 89 | // then we expect name 90 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 91 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN); 92 | 93 | // check name is still "MY SECTION" 94 | BOOST_TEST(ctx.line_pos == "MY SECTION"); 95 | 96 | // then we expect end of stream 97 | BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0); 98 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 99 | } 100 | 101 | 102 | BOOST_AUTO_TEST_CASE(whitespace_lex) 103 | { 104 | const char *input = "\t\t \t\t"; 105 | taihen_config_lexer ctx; 106 | 107 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 108 | 109 | // we should expect comment token 110 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 111 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_COMMENT_TOKEN); 112 | 113 | BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0); 114 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 115 | } 116 | 117 | BOOST_AUTO_TEST_CASE(comment_lex) 118 | { 119 | const char *input = "#this is a comment"; 120 | taihen_config_lexer ctx; 121 | 122 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 123 | 124 | // we should expect comment token 125 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 126 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_COMMENT_TOKEN); 127 | 128 | BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0); 129 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 130 | } 131 | 132 | BOOST_AUTO_TEST_CASE(path_lex) 133 | { 134 | const char *input = "this:/is/a/path"; 135 | taihen_config_lexer ctx; 136 | 137 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0); 138 | 139 | // we should expect path token, this isnt valid config syntax 140 | // but its not lexer job to ensure its correct order 141 | // it just tokenises the input 142 | BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0); 143 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_PATH_TOKEN); 144 | 145 | BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0); 146 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 147 | } 148 | 149 | BOOST_AUTO_TEST_CASE(random_lex) 150 | { 151 | std::random_device seed; 152 | std::mt19937_64 mt; 153 | std::vector line(255); 154 | taihen_config_lexer ctx; 155 | 156 | // seed mt from random device 157 | mt.seed(seed()); 158 | 159 | for (auto i = 0; i < 100000; ++i) 160 | { 161 | std::generate(line.begin(), std::prev(line.end()), mt); 162 | 163 | line[254] = '\0'; 164 | 165 | BOOST_WARN_GE(taihen_config_init_lexer(&ctx, (char *)(line.data())), 0); 166 | 167 | while(1) 168 | { 169 | int res = taihen_config_lex(&ctx); 170 | 171 | if (res < 0) 172 | { 173 | std::stringstream ss; 174 | 175 | ss << "on generated data: " << std::hex << std::setfill('0'); 176 | 177 | std::for_each(line.begin(), line.end(), [&ss](auto& v) 178 | { 179 | ss << std::setw(2) << static_cast(v); 180 | }); 181 | 182 | ss << std::endl; 183 | 184 | 185 | BOOST_TEST_REQUIRE(res >= 0, ss.str()); 186 | } 187 | 188 | if (res == 0) 189 | { 190 | break; 191 | } 192 | } 193 | } 194 | 195 | BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN); 196 | } 197 | 198 | BOOST_AUTO_TEST_CASE(long_line_lex) 199 | { 200 | char line[CONFIG_MAX_LINE_LENGTH+1]; 201 | taihen_config_lexer ctx; 202 | 203 | std::memset(line, 'a', sizeof(line)); 204 | line[CONFIG_MAX_LINE_LENGTH] = '\0'; 205 | 206 | BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, line), 0); 207 | BOOST_REQUIRE_LT(taihen_config_lex(&ctx), 0); 208 | } 209 | -------------------------------------------------------------------------------- /test/test_parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BOOST_TEST_MODULE parser 4 | #include 5 | 6 | BOOST_AUTO_TEST_CASE(removed_for_now) 7 | { 8 | 9 | } 10 | --------------------------------------------------------------------------------