├── CMakeLists.txt
├── LICENSE
├── README.md
├── include
    └── taihen
    │   ├── lexer.h
    │   └── parser.h
├── src
    ├── CMakeLists.txt
    ├── lexer.c
    └── parser.c
└── test
    ├── CMakeLists.txt
    ├── test_lexer.cpp
    └── test_parser.cpp


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.1.0)
 2 | 
 3 | project(taihen-parser)
 4 | 
 5 | option(TEST "build and perform tests" OFF)
 6 | option(INSTALL_ENABLED "if enabled add files to cmake's install()" ON)
 7 | option(USE_INBUILT_CTYPE "use internal ctype over system" OFF)
 8 | option(USE_INBUILT_STRING "use internal string over system" OFF)
 9 | 
10 | include_directories(include)
11 | 
12 | add_subdirectory(src)
13 | 
14 | if (${TEST})
15 |     add_subdirectory(test)
16 | 
17 |     enable_testing()
18 |     add_test(NAME LexerTest COMMAND test-lexer)
19 | endif()
20 | 
21 | if (${INSTALL_ENABLED})
22 | install(DIRECTORY include/taihen/
23 |     DESTINATION include/taihen
24 |     FILES_MATCHING PATTERN "*.h")
25 | endif()
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 David "Davee" Morgan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # taihen-parser - _taiHEN's configuration parser_
  2 | 
  3 | taiHEN is a custom firmware (CFW) framework for PS Vita™ and implements a configuration to help packagers and users control which modules or plugins are loaded and when.
  4 | taihen-parser provides a convenient C API for interacting with these configuration files to help developers write supporting tools for taiHEN. taihen-parser provides both a lexer and parser API for configuration files.
  5 | 
  6 | The problem with CFW of a previous era was that it was one person's imagination of a custom firmware. Average developer X could not easily replace the in-game menu within the provided CFW. Likewise, average user Y could not strip out features they did not like. Here, taiHEN provides a solution by providing a configuration file where CFW can be defined as a set of modules and plugins. 
  7 | 
  8 | Person X may like live-area mod1 and person Y may like live-area mod2. No longer do these two need to chose between CFW A and CFW B that implements mod1 and mod2, respectively. Instead, they can modify this configuration from their favourite CFW to use whichever mod they prefer. This architecture promotes the _custom_ in _custom firmware_ by encouraging developers to move away from huge monolithic CFW of the past and help nurture an open, compatible and _user orientated_ custom firmware experience.
  9 | 
 10 | ## Configuration Format
 11 | taiHEN employs a text based format for configuring automatic loading of modules. The configuration format is a UTF-8 text file that utilises line seperation to ease parsing and human readability. Each line must be exclusive to one of four types:
 12 |  - An empty line
 13 |  - A comment
 14 |  - Section
 15 |  - Module path
 16 | 
 17 | Each line can be at most ```CONFIG_MAX_LINE_LENGTH``` characters wide, and trailing/leading whitespace is permitted.
 18 | 
 19 | ## Lexer Tokens
 20 | The config lexer produces the following tokens:
 21 |  - ```CONFIG_START_TOKEN```
 22 |  - ```CONFIG_END_TOKEN```
 23 |  - ```CONFIG_COMMENT_TOKEN```
 24 |  - ```CONFIG_SECTION_TOKEN```
 25 |  - ```CONFIG_SECTION_HALT_TOKEN```
 26 |  - ```CONFIG_SECTION_NAME_TOKEN```
 27 |  - ```CONFIG_PATH_TOKEN```
 28 | 
 29 | A valid configuration format should obey the grammar:
 30 | ```
 31 | config ::= CONFIG_START_TOKEN (CONFIG_COMMENT_TOKEN | section)* CONFIG_END_TOKEN
 32 | section ::= CONFIG_SECTION_TOKEN CONFIG_SECTION_HALT_TOKEN? CONFIG_SECTION_NAME_TOKEN ('\n' | EOF) path*
 33 | path ::= CONFIG_PATH_TOKEN ('\n' | EOF)
 34 | ```
 35 | 
 36 | ## Sections: ```*```
 37 | A section in the configuration file functions as a filter and controller for CFW module loading.
 38 | Each section begins with a ```*``` and can optionally be followed with a ```!``` to mark the section as a halt point (see further below). After these tokens, the rest of the line a UTF-8 name for the section.
 39 | 
 40 | A section of the same name may appear in the file multiple times. This functionality is intended to allow users to take advantage of taiHEN's load ordering policy.
 41 | 
 42 | ### Halt point: ```!```
 43 | A section can optionally have the halt point token ```!``` following the section token ```*``` in the configuration file. This token instructs the parser to stop further parsing of the file if the section name is within context. See the examples below for a visual worked case on this feature.
 44 | 
 45 | ### Reserved names
 46 | There are currently two reserved names for sections:
 47 |  - ```ALL``` - A catch all user-mode section that will load the modules it contains for every user-mode process.
 48 |  - ```KERNEL``` - A section that loads resident kernel modules on the start of taiHEN.
 49 | 
 50 | Using the halt point ```!``` on these sections results in undefined behaviour.
 51 | 
 52 | ## API
 53 | This API currently offers no guarantee of stability. Please remember that it may change drastically upon any future versions of taiHEN.
 54 | taiHEN's configuration parser exposes it's lexer algorithm to assist in development of supporting tools. Please consult the header files for documentation.
 55 | 
 56 | ## Example Configurations
 57 | 
 58 | Below is an example of a very simple configuration:
 59 | ```
 60 | # example simple config
 61 | *ALL
 62 | ux0:/plugins/my_plugin.suprx
 63 | ux0:/plugins/my_plugin2.suprx
 64 | ```
 65 | 
 66 | This example consists of a single section ```ALL```. Which means that every game/application/homebrew that is launched will have both ```my_plugin.suprx``` and ```my_plugin2.suprx``` loaded in that process space and in order.
 67 | 
 68 | More precise functionality may be required for certain homebrew. Perhaps you wish to package your own CFW, in which case you may create a complex configuration as shown below:
 69 | ```
 70 | # hello this is a comment. this line is ignored
 71 |    # this line also
 72 |         # this too, whitespace at the start of a line is OK
 73 | *COOL_GAME
 74 | # i'm within a section, woo!
 75 |     ux0:/coolgame/plugin.suprx
 76 |     # indentation is ok with me
 77 |     ux0:/coolgame/plugin2.suprx
 78 | 	# spaces within path is ok
 79 | 	ux0:/really cool/I haVe spaces and caps/plugin3.suprx
 80 | # next section
 81 | *ALL
 82 |     # i'm a special section!
 83 |     # i'm always included... usually
 84 |     ux0:/plugins/ingamemusic.suprx
 85 | *KERNEL
 86 |     # i'm a special section also!
 87 |     # my plugins are loaded to kernel memory as resident modules
 88 |     ux0:/taihen/henkaku.skprx
 89 |     ux0:/psphacks.skprx
 90 | *COOL_GAME
 91 |     # this section again?! this is ok! this is a way packagers
 92 |     # can take advantage of load order.
 93 |     ux0:/coolgame/idependoningamemusic.suprx
 94 | *!COOL_GAME2
 95 |     # what is the '!' for?
 96 |     # the '!' prevents further parsing
 97 |     # this would make more sense to put at the start if you want to
 98 |     # blacklist certain modules
 99 |     # look, nothing to load!
100 | *ALL
101 |     ux0:/plugins/ibreak_coolgame2.suprx
102 | 	
103 | 	# emojis?
104 | 	ux0:/🤔/🦄/👻/🎃.suprx
105 | ```
106 | Much more complex, but really I expect even more complexity when real CFW components come around. As mentioned previously, parsing occurs from top to bottom, identical to load order. When parsing, a section context is selected. In the case of taiHEN, this context is a title id such as ```MLCL00001``` for our molecularShell homebrew. In this case, lets assume for ease that we have selected ```COOL_GAME``` and it is a user-mode process.
107 | 
108 | Comments are ignored, so lets continue until we reach the first section: ```COOL_GAME```. Since our selected section matches this first section, the paths below are loaded until a new section is reached.
109 |  - ```ux0:/coolgame/plugin.suprx```
110 |  - ```ux0:/coolgame/plugin2.suprx```
111 |  - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
112 | 
113 | Then we reach a new section ```ALL```. As mentioned above, ```ALL``` is a special reserved section name that matches every user-mode process. So our loaded module list grows:
114 |  - ```ux0:/coolgame/plugin.suprx```
115 |  - ```ux0:/coolgame/plugin2.suprx```
116 |  - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
117 |  - ```ux0:/plugins/ingamemusic.suprx```
118 | 
119 | Next section we reach is the special section ```KERNEL```. This is not processed within our context we so continue until we reach the next section: ```COOL_GAME```. We have already had this section before, but we multiple sections are allowed to take advantage of taiHEN's module load ordering. This is extremely useful when you have dependencies between plugins/modules that need resolved. In this example we have ```idependoningamemusic.suprx``` which must be loaded after ```ingamemusic.suprx```.
120 | 
121 | Our load list now looks like:
122 |  - ```ux0:/coolgame/plugin.suprx```
123 |  - ```ux0:/coolgame/plugin2.suprx```
124 |  - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
125 |  - ```ux0:/plugins/ingamemusic.suprx```
126 |  - ```ux0:/coolgame/idependoningamemusic.suprx```
127 | 
128 | Next section is ```COOL_GAME2``` which does not match our section context. This section has a halt point ```!``` but we ignore it in this case because we do not much it.
129 | 
130 | Lastly, we have the final section ```ALL``` again, which completes our load list:
131 |  - ```ux0:/coolgame/plugin.suprx```
132 |  - ```ux0:/coolgame/plugin2.suprx```
133 |  - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
134 |  - ```ux0:/plugins/ingamemusic.suprx```
135 |  - ```ux0:/coolgame/idependoningamemusic.suprx```
136 |  - ```ux0:/plugins/ibreak_coolgame2.suprx```
137 |  - ```ux0:/🤔/🦄/👻/🎃.suprx```
138 | 
139 | NOTE: I don't know conclusively if the Vita filesystem supports emojis. Don't use them...
140 | 
141 | ### ```COOL_GAME2``` Halt Point Example
142 | Following the same logic as above, we will walk through the configuration as ```COOL_GAME2``` context.
143 | 
144 | First section is ```COOL_GAME```, not a match so we skip it.
145 | 
146 | Second section is ```ALL```, so we load modules from it:
147 |  - ```ux0:/plugins/ingamemusic.suprx```
148 | 
149 | Third section is ```KERNEL```, so we skip it.
150 | 
151 | Fourth section is ```COOL_GAME``` again so we skip it.
152 | 
153 | Fifth section is ```COOL_GAME2``` so we process it. This time we have a halt point so this will be the last section we process. Remember, the halt point ```!``` stops any further parsing. This section however has no modules, so nothing is loaded. A section with no modules is OK. In this case, the following ```ALL``` section breaks ```COOL_GAME2``` in our hypothetical world. By using the halt point correctly, a CFW packager can maximise compatibility whilst maintaining load ordering.
154 | 
155 | Our final module loading list for ```COOL_GAME2```:
156 |  - ```ux0:/plugins/ingamemusic.suprx```
157 | 
158 | # Building
159 | To build taihen-parser, you require CMake to generate the appropriate build scripts.
160 | From within the repository directory:
161 | ```sh
162 | $ mkdir build && cd build
163 | $ cmake ..
164 | $ make
165 | ```
166 | 
167 | To build the included tests you require the boost ```unit_test_framework``` installed. Then instead use:
168 | ```sh
169 | $ mkdir build && cd build
170 | $ cmake -DTEST=ON ..
171 | $ make
172 | ```
173 | 
174 | # Installation
175 | To install to a specified location define ```CMAKE_INSTALL_PREFIX```:
176 | ```sh
177 | $ mkdir build && cd build
178 | $ cmake -DCMAKE_INSTALL_PREFIX=/my/install/location ..
179 | $ make
180 | $ make install
181 | ```
182 | 
183 | # Acknowledgements
184 | Team molecule for HENkaku, Yifan Lu for taiHEN and xyz for immense support of the vitasdk.
185 | 
186 | ## License
187 | taihen-parser is licensed under the terms of the MIT license which can be read in the ```LICENSE``` file in the root of the repository.
188 | (C) 2016 David "Davee" Morgan
189 | 


--------------------------------------------------------------------------------
/include/taihen/lexer.h:
--------------------------------------------------------------------------------
 1 | #ifndef LEXER_H
 2 | #define LEXER_H
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | #define CONFIG_MAX_LINE_LENGTH (256)
 9 | 
10 | typedef enum
11 | {
12 |     CONFIG_START_TOKEN,
13 |     CONFIG_END_TOKEN,
14 |     CONFIG_COMMENT_TOKEN,
15 |     CONFIG_SECTION_TOKEN,
16 |     CONFIG_SECTION_HALT_TOKEN,
17 |     CONFIG_SECTION_NAME_TOKEN,
18 |     CONFIG_PATH_TOKEN
19 | } taihen_config_lexer_token;
20 | 
21 | typedef struct
22 | {
23 |     const char *input;
24 |     const char *end;
25 |     taihen_config_lexer_token token;
26 |     char line[CONFIG_MAX_LINE_LENGTH];
27 |     char *line_pos;
28 | } taihen_config_lexer;
29 | 
30 | int taihen_config_init_lexer(taihen_config_lexer *ctx, const char *input);
31 | int taihen_config_lex(taihen_config_lexer *ctx);
32 | 
33 | #ifdef __cplusplus
34 | }
35 | #endif
36 | #endif // LEXER_H
37 | 


--------------------------------------------------------------------------------
/include/taihen/parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARSER_H
 2 | #define PARSER_H
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | typedef void (* taihen_config_handler)(const char *module, void *param);
 9 | 
10 | int taihen_config_validate(const char *input);
11 | void taihen_config_parse(const char *input, const char *section, taihen_config_handler handler, void *param);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | #endif // PARSER_H
17 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include(CheckIncludeFile)
 2 | 
 3 | if (MSVC)
 4 |     add_definitions(-Dinline=__inline)
 5 | endif()
 6 | 
 7 | check_include_file(ctype.h HAVE_CTYPE)
 8 | check_include_file(string.h HAVE_STRING)
 9 | 
10 | if ((NOT ${HAVE_CTYPE}) OR ${USE_INBUILT_CTYPE})
11 |     add_definitions(-DNO_CTYPE)
12 | endif()
13 | 
14 | if ((NOT ${HAVE_STRING}) OR ${USE_INBUILT_STRING})
15 |     add_definitions(-DNO_STRING)
16 | endif()
17 | 
18 | add_definitions(-Os -ffunction-sections -fdata-sections)
19 | 
20 | add_library(taihenparser lexer.c parser.c)
21 | 
22 | if (${INSTALL_ENABLED})
23 | install(TARGETS taihenparser
24 |     RUNTIME DESTINATION bin
25 |     LIBRARY DESTINATION lib
26 |     ARCHIVE DESTINATION lib)
27 | endif()
28 | 


--------------------------------------------------------------------------------
/src/lexer.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * lexer.c - tokenisation algorithm for taihen configuration files
  3 |  *
  4 |  * Copyright (C) 2016 David "Davee" Morgan
  5 |  *
  6 |  * This software may be modified and distributed under the terms
  7 |  * of the MIT license.  See the LICENSE file for details.
  8 |  */
  9 | 
 10 | #include <taihen/lexer.h>
 11 | 
 12 | #ifndef NO_STRING
 13 | #include <string.h>
 14 | #endif // NO_STRING
 15 | #ifndef NO_CTYPE
 16 | #include <ctype.h>
 17 | #endif // NO_CTYPE
 18 | 
 19 | static const char TOKEN_EMPTY = '\0';
 20 | static const char TOKEN_COMMENT_START = '#';
 21 | static const char TOKEN_SECTION_START = '*';
 22 | static const char TOKEN_HALT = '!';
 23 | 
 24 | #ifdef NO_CTYPE
 25 | static int isspace(int c)
 26 | {
 27 |     // we use "C"  locale
 28 |     return      (c == ' ')
 29 |             ||  (c == '\t')
 30 |             ||  (c == '\n')
 31 |             ||  (c == '\v')
 32 |             ||  (c == '\f')
 33 |             ||  (c == '\r');
 34 | }
 35 | #endif // NO_CTYPE
 36 | 
 37 | #ifdef NO_STRING
 38 | #include <stddef.h>
 39 | 
 40 | static size_t strlen(const char *s)
 41 | {
 42 |     size_t idx = 0;
 43 | 
 44 |     while (s[idx])
 45 |     {
 46 |         ++idx;
 47 |     }
 48 | 
 49 |     return idx;
 50 | }
 51 | 
 52 | static void *memset(void *s, int c, size_t len)
 53 | {
 54 |     unsigned char *p = (unsigned char *)s;
 55 | 
 56 |     while (len--)
 57 |     {
 58 |         *p++ = (unsigned char)c;
 59 |     }
 60 | 
 61 |     return s;
 62 | }
 63 | 
 64 | static void *memcpy(void *s1, const void * s2, size_t len)
 65 | {
 66 |     char *dest = (char *)s1;
 67 |     const char *src = (const char *)s2;
 68 | 
 69 |     while (len--)
 70 |     {
 71 |         *dest++ = *src++;
 72 |     }
 73 | 
 74 |     return s1;
 75 | }
 76 | #endif // NO_STRING
 77 | 
 78 | static char *skip_whitespace(char *input)
 79 | {
 80 |     while (isspace((unsigned char)*input))
 81 |     {
 82 |         ++input;
 83 |     }
 84 | 
 85 |     return input;
 86 | }
 87 | 
 88 | static void trim_whitespace(char *input)
 89 | {
 90 |     char *end = input + strlen(input)-1;
 91 | 
 92 |     while (end > input)
 93 |     {
 94 |         if (!isspace((unsigned char)*end))
 95 |         {
 96 |             break;
 97 |         }
 98 | 
 99 |         *end = '\0';
100 |         end--;
101 |     }
102 | }
103 | 
104 | static const char *get_newline(const char *input)
105 | {
106 |     while (*input)
107 |     {
108 |         if (*input == '\r' || *input == '\n')
109 |         {
110 |             break;
111 |         }
112 | 
113 |         ++input;
114 |     }
115 | 
116 |     return input;
117 | }
118 | 
119 | static int lex_line(taihen_config_lexer *ctx)
120 | {
121 |     if (ctx->input >= ctx->end)
122 |     {
123 |         ctx->token = CONFIG_END_TOKEN;
124 |         return 0;
125 |     }
126 | 
127 |     const char *line_end = get_newline(ctx->input);
128 |     size_t len = line_end - ctx->input;
129 | 
130 | 
131 |     // check our line can fit in our buffer
132 |     if (len >= CONFIG_MAX_LINE_LENGTH)
133 |     {
134 |         return -1;
135 |     }
136 | 
137 |     // copy line to our buffer so we can modify it
138 |     memcpy(ctx->line, ctx->input, len);
139 |     ctx->line[len] = '\0';
140 |     ctx->line_pos = ctx->line;
141 |     ctx->input = line_end+1;
142 | 
143 |     // remove leading whitespace
144 |     ctx->line_pos = skip_whitespace(ctx->line_pos);
145 | 
146 |     // check for empty line or comment
147 |     if (*ctx->line_pos == TOKEN_EMPTY || *ctx->line_pos == TOKEN_COMMENT_START)
148 |     {
149 |         ctx->token = CONFIG_COMMENT_TOKEN;
150 |         return 1;
151 |     }
152 | 
153 |     // remove any trailing whitespace
154 |     trim_whitespace(ctx->line_pos);
155 | 
156 |     // check if our line is empty now
157 |     if (*ctx->line_pos == TOKEN_EMPTY)
158 |     {
159 |         ctx->token = CONFIG_COMMENT_TOKEN;
160 |         return 1;
161 |     }
162 | 
163 |     // check for section start
164 |     if (*ctx->line_pos == TOKEN_SECTION_START)
165 |     {
166 |         ctx->token = CONFIG_SECTION_TOKEN;
167 |     }
168 |     else
169 |     {
170 |         // should be a path
171 |         ctx->token = CONFIG_PATH_TOKEN;
172 |     }
173 | 
174 |     return 1;
175 | }
176 | 
177 | static int lex_section_halt(taihen_config_lexer *ctx)
178 | {
179 |     // skip more whitespace
180 |     ctx->line_pos = skip_whitespace(ctx->line_pos+1);
181 | 
182 |     // check for halt token
183 |     if (*ctx->line_pos == TOKEN_HALT)
184 |     {
185 |         ctx->token = CONFIG_SECTION_HALT_TOKEN;
186 |     }
187 |     else
188 |     {
189 |         // should be a name
190 |         ctx->token = CONFIG_SECTION_NAME_TOKEN;
191 |     }
192 | 
193 |     return 1;
194 | }
195 | 
196 | static int lex_section_name(taihen_config_lexer *ctx)
197 | {
198 |     // skip more whitespace
199 |     ctx->line_pos = skip_whitespace(ctx->line_pos+1);
200 | 
201 |     // should be a name
202 |     ctx->token = CONFIG_SECTION_NAME_TOKEN;
203 |     return 1;
204 | }
205 | 
206 | /*!
207 |     \brief Initialise or reset lexer context.
208 | 
209 |     taihen_config_init_lexer will init/reset the provided taihen_config_lexer and assign the
210 |     provided input to the context.
211 | 
212 |     \param ctx      A non-null pointer to a context to initialise or reset.
213 |     \param input    A non-null UTF-8 encoded null-terminated string to tokenise.
214 |     \return zero on success, < 0 on error.
215 |  */
216 | int taihen_config_init_lexer(taihen_config_lexer *ctx, const char *input)
217 | {
218 |     if (ctx == NULL || input == NULL)
219 |     {
220 |         return -1;
221 |     }
222 | 
223 |     // reset everything to default and reset input/end pointer
224 |     memset(ctx, 0, sizeof(taihen_config_lexer));
225 |     ctx->token = CONFIG_START_TOKEN;
226 |     ctx->input = input;
227 |     ctx->end = input + strlen(input);
228 |     return 0;
229 | }
230 | 
231 | /*!
232 |     \brief Retrieve the next lexer token.
233 | 
234 |     taihen_config_lex will accept an initialised context and provide the next token
235 |     in the stream. This tokenisation does no checking on formatting and as such does not
236 |     confirm that the document provided is well-formed.
237 | 
238 |     \param ctx  A non-null point to an initialised context.
239 |     \return 0 if there are no further tokens, > 0 if there are further tokens else < 0 on error.
240 |     \sa taihen_config_init_lexer
241 |  */
242 | int taihen_config_lex(taihen_config_lexer *ctx)
243 | {
244 |     if (ctx == NULL)
245 |     {
246 |         return -1;
247 |     }
248 | 
249 |     switch (ctx->token)
250 |     {
251 |     case CONFIG_START_TOKEN:
252 |     case CONFIG_COMMENT_TOKEN:
253 |     case CONFIG_PATH_TOKEN:
254 |     case CONFIG_SECTION_NAME_TOKEN:
255 |         return lex_line(ctx);
256 | 
257 |     case CONFIG_SECTION_TOKEN:
258 |         return lex_section_halt(ctx);
259 | 
260 |     case CONFIG_SECTION_HALT_TOKEN:
261 |         return lex_section_name(ctx);
262 | 
263 |     case CONFIG_END_TOKEN:
264 |     default:
265 |         return -1;
266 |     }
267 | }
268 | 


--------------------------------------------------------------------------------
/src/parser.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * parser.c - parser algorithm for taihen configuration files
  3 |  *
  4 |  * Copyright (C) 2016 David "Davee" Morgan
  5 |  *
  6 |  * This software may be modified and distributed under the terms
  7 |  * of the MIT license.  See the LICENSE file for details.
  8 |  */
  9 | 
 10 | #include <taihen/parser.h>
 11 | #include <taihen/lexer.h>
 12 | 
 13 | #ifndef NO_STRING
 14 | #include <string.h>
 15 | #endif // NO_STRING
 16 | 
 17 | static const char *TOKEN_ALL_SECTION = "ALL";
 18 | static const char *TOKEN_KERNEL_SECTION = "KERNEL";
 19 | 
 20 | #ifdef NO_STRING
 21 | #include <stddef.h>
 22 | 
 23 | static size_t strlen(const char *s)
 24 | {
 25 |     size_t idx = 0;
 26 | 
 27 |     while (s[idx])
 28 |     {
 29 |         ++idx;
 30 |     }
 31 | 
 32 |     return idx;
 33 | }
 34 | 
 35 | static int strcmp(const char * s1, const char * s2)
 36 | {
 37 |     while ((*s1) && (*s1 == *s2))
 38 |     {
 39 |         ++s1;
 40 |         ++s2;
 41 |     }
 42 |     return (*s1 - *s2);
 43 | }
 44 | 
 45 | #endif // NO_STRING
 46 | 
 47 | static inline int is_continuation_byte(char b)
 48 | {
 49 |     return ((b & 0xC0) == 0x80);
 50 | }
 51 | 
 52 | static inline int check_continuation_bytes(const char *start, const char *end, int len)
 53 | {
 54 |     if ((end - start) < len)
 55 |     {
 56 |         return 0;
 57 |     }
 58 | 
 59 |     for (int i = 0; i < len; ++i)
 60 |     {
 61 |         if (!is_continuation_byte(start[i]))
 62 |         {
 63 |             return 0;
 64 |         }
 65 |     }
 66 | 
 67 |     return 1;
 68 | }
 69 | 
 70 | static int check_utf8_sequence(const char *str, const char *end, unsigned char mask, unsigned char lead, int cont_len)
 71 | {
 72 |     if ((*str & mask) == lead)
 73 |     {
 74 |         if (check_continuation_bytes(str+1, end, cont_len))
 75 |         {
 76 |             return -1;
 77 |         }
 78 | 
 79 |         return 1;
 80 |     }
 81 | 
 82 |     return 0;
 83 | }
 84 | 
 85 | static int check_utf8(const char *str)
 86 | {
 87 |     struct
 88 |     {
 89 |         unsigned char mask;
 90 |         unsigned char lead;
 91 |         unsigned char cont_len;
 92 |     } utf8_lut[4] =
 93 |     {
 94 |         { 0x80, 0x00, 0 }, // U+0000 -> U+007F, 0xxxxxx
 95 |         { 0xE0, 0xC0, 1 }, // U+0080 -> U+07FF, 110xxxxx
 96 |         { 0xF0, 0xE0, 2 }, // U+0800 -> U+FFFF, 1110xxxx
 97 |         { 0xF8, 0xF0, 3 }, // U+10000 -> U+10FFFF, 11110xxx
 98 |     };
 99 | 
100 |     const char *end = str + strlen(str);
101 | 
102 |     while (str < end)
103 |     {
104 |         int i = 0;
105 | 
106 |         for (i = 0; i < 4; ++i)
107 |         {
108 |             int res = check_utf8_sequence(str, end, utf8_lut[i].mask, utf8_lut[i].lead, utf8_lut[i].cont_len);
109 | 
110 |             // check if valid sequence but incorrect contiunation
111 |             if (res < 0)
112 |             {
113 |                 return 0;
114 |             }
115 | 
116 |             // check if valid sequence
117 |             if (res > 0)
118 |             {
119 |                 str += utf8_lut[i].cont_len+1;
120 |                 break;
121 |             }
122 |         }
123 | 
124 |         // check if we had no valid sequences
125 |         if (i == 4)
126 |         {
127 |             return 0;
128 |         }
129 |     }
130 | 
131 |     return 1;
132 | }
133 | 
134 | /*!
135 |     \brief Check whether a configuration is valid syntax.
136 | 
137 |     taihen_config_validate is used to check whether a provided configuration is valid syntax.
138 |     This is useful when used before taihen_config_parse to provide error checking before stream based
139 |     parsing.
140 | 
141 |     \param input A UTF-8 encoded null-terminated string containing the configuration to check.
142 |     \return non-zero on valid configuration, else zero on invalid.
143 |     \sa taihen_config_parse
144 |  */
145 | int taihen_config_validate(const char *input)
146 | {
147 |     taihen_config_lexer ctx;
148 |     taihen_config_init_lexer(&ctx, input);
149 | 
150 |     int have_section = 0;
151 |     int lex_result = 0;
152 | 
153 |     while ((lex_result = taihen_config_lex(&ctx)) > 0)
154 |     {
155 |         switch (ctx.token)
156 |         {
157 |         case CONFIG_SECTION_NAME_TOKEN:
158 |             // ensure we actually have a string
159 |             if (strlen(ctx.line_pos) == 0)
160 |             {
161 |                 return 0;
162 |             }
163 | 
164 |             // validate it is UTF-8
165 |             if (!check_utf8(ctx.line_pos))
166 |             {
167 |                 return 0;
168 |             }
169 | 
170 |             have_section = 1;
171 |             break;
172 | 
173 |         case CONFIG_PATH_TOKEN:
174 |             if (!have_section)
175 |             {
176 |                 // paths must belong to a section
177 |                 return 0;
178 |             }
179 | 
180 |             // ensure we actually have a string
181 |             if (strlen(ctx.line_pos) == 0)
182 |             {
183 |                 return 0;
184 |             }
185 | 
186 |             // validate it is UTF-8
187 |             if (!check_utf8(ctx.line_pos))
188 |             {
189 |                 return 0;
190 |             }
191 | 
192 |             break;
193 | 
194 |         // ignore these, nothing to check
195 |         case CONFIG_SECTION_HALT_TOKEN:
196 |         case CONFIG_COMMENT_TOKEN:
197 |         case CONFIG_SECTION_TOKEN:
198 |         case CONFIG_END_TOKEN:
199 |             break;
200 | 
201 |         // unexpected tokens, invalid document
202 |         default:
203 |             return 0;
204 |         }
205 |     }
206 | 
207 |     return (lex_result == 0);
208 | }
209 | 
210 | /*!
211 |     \brief taihen_config_parse parses a configuration for contextualised paths.
212 | 
213 |     taihen_config_parse is used to obtain an ordered stream of the paths appropriate for the section provided.
214 |     Special sections such as ALL and KERNEL will be taken into consideration when generating the stream.
215 | 
216 |     taihen_config_parse provides no error checking or handling. Use taihen_config_validate before parsing the
217 |     document to avoid errors in parsing.
218 | 
219 |    \param input     A UTF-8 encoded null-terminated string containing the configuration to parse.
220 |    \param section   A UTF-8 encoded null-terminated string containing the section to base context from.
221 |    \param handler   A taihen_config_handler to receive the stream of paths.
222 |    \param param     A user provided value that is passed to the provided taihen_config_handler.
223 |    \sa taihen_config_validate
224 |  */
225 | void taihen_config_parse(const char *input, const char *section, taihen_config_handler handler, void *param)
226 | {
227 |     taihen_config_lexer ctx;
228 |     taihen_config_init_lexer(&ctx, input);
229 | 
230 |     int halt_flag = 0;
231 |     int record_entries = 0;
232 | 
233 |     while (taihen_config_lex(&ctx) > 0)
234 |     {
235 |         switch (ctx.token)
236 |         {
237 |         case CONFIG_SECTION_HALT_TOKEN:
238 |             halt_flag = 1;
239 |             break;
240 | 
241 |         case CONFIG_SECTION_NAME_TOKEN:
242 |             if (strcmp(ctx.line_pos, TOKEN_ALL_SECTION) == 0 && strcmp(section, TOKEN_KERNEL_SECTION) != 0)
243 |             {
244 |                 record_entries = 1;
245 |             }
246 |             else if (strcmp(section, ctx.line_pos) == 0)
247 |             {
248 |                 record_entries = 1;
249 |             }
250 |             else
251 |             {
252 |                 record_entries = 0;
253 |             }
254 | 
255 |             break;
256 | 
257 |         case CONFIG_SECTION_TOKEN:
258 |             if (record_entries && halt_flag)
259 |             {
260 |                 return;
261 |             }
262 | 
263 |             halt_flag = 0;
264 |             break;
265 | 
266 |         case CONFIG_PATH_TOKEN:
267 |             if (record_entries)
268 |             {
269 |                 handler(ctx.line_pos, param);
270 |             }
271 | 
272 |             break;
273 | 
274 |         default:
275 |             break;
276 |         }
277 |     }
278 | }
279 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | find_package(Boost COMPONENTS unit_test_framework REQUIRED)
 2 | 
 3 | include_directories(${taihen-config_SOURCE_DIR}/src ${Boost_INCLUDE_DIRS})
 4 | link_directories(${Boost_LIBRARY_DIRS})
 5 | 
 6 | add_executable(test-lexer test_lexer.cpp)
 7 | target_link_libraries(test-lexer taihenconfig)
 8 | 
 9 | add_executable(test-parser test_parser.cpp)
10 | target_link_libraries(test-parser taihenconfig)
11 | 


--------------------------------------------------------------------------------
/test/test_lexer.cpp:
--------------------------------------------------------------------------------
  1 | #include <taihen/lexer.h>
  2 | 
  3 | #define BOOST_TEST_MODULE lexer
  4 | #include <boost/test/unit_test.hpp>
  5 | 
  6 | #include <random>
  7 | #include <iomanip>
  8 | 
  9 | BOOST_AUTO_TEST_CASE(init_lexer)
 10 | {
 11 |     const char *input = "";
 12 |     taihen_config_lexer ctx;
 13 | 
 14 |     // test NULL parameter handling
 15 |     BOOST_REQUIRE_LT(taihen_config_init_lexer(NULL, NULL), 0);
 16 |     BOOST_REQUIRE_LT(taihen_config_init_lexer(&ctx, NULL), 0);
 17 |     BOOST_REQUIRE_LT(taihen_config_init_lexer(NULL, input), 0);
 18 | 
 19 |     // test correct input
 20 |     BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, input), 0);
 21 | }
 22 | 
 23 | BOOST_AUTO_TEST_CASE(empty_lex)
 24 | {
 25 |     const char *input = "";
 26 |     taihen_config_lexer ctx;
 27 | 
 28 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
 29 | 
 30 |     // we should expect immediate end of stream
 31 |     BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
 32 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
 33 | }
 34 | 
 35 | BOOST_AUTO_TEST_CASE(reset_lexer)
 36 | {
 37 |     const char *input = "";
 38 |     taihen_config_lexer ctx;
 39 | 
 40 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
 41 | 
 42 |     // we should expect immediate end of stream
 43 |     BOOST_WARN_EQUAL(taihen_config_lex(&ctx), 0);
 44 |     BOOST_WARN_EQUAL(ctx.token, CONFIG_END_TOKEN);
 45 | 
 46 |     // reset the lexer
 47 |     BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, input), 0);
 48 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_START_TOKEN);
 49 | }
 50 | 
 51 | BOOST_AUTO_TEST_CASE(simple_section_lex)
 52 | {
 53 |     const char *input = "*MY SECTION";
 54 |     taihen_config_lexer ctx;
 55 | 
 56 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
 57 | 
 58 |     // we should expect section token
 59 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
 60 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN);
 61 | 
 62 |     // then we expect name
 63 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
 64 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN);
 65 | 
 66 |     // check name is still "MY SECTION"
 67 |     BOOST_TEST(ctx.line_pos == "MY SECTION");
 68 | 
 69 |     // then we expect end of stream
 70 |     BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
 71 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
 72 | }
 73 | 
 74 | BOOST_AUTO_TEST_CASE(complex_section_lex)
 75 | {
 76 |     const char *input = "*!MY SECTION";
 77 |     taihen_config_lexer ctx;
 78 | 
 79 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
 80 | 
 81 |     // we should expect section token
 82 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
 83 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN);
 84 | 
 85 |     // we should expect section halt token
 86 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
 87 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_HALT_TOKEN);
 88 | 
 89 |     // then we expect name
 90 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
 91 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN);
 92 | 
 93 |     // check name is still "MY SECTION"
 94 |     BOOST_TEST(ctx.line_pos == "MY SECTION");
 95 | 
 96 |     // then we expect end of stream
 97 |     BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
 98 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
 99 | }
100 | 
101 | 
102 | BOOST_AUTO_TEST_CASE(whitespace_lex)
103 | {
104 |     const char *input = "\t\t    \t\t";
105 |     taihen_config_lexer ctx;
106 | 
107 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
108 | 
109 |     // we should expect comment token
110 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
111 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_COMMENT_TOKEN);
112 | 
113 |     BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
114 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
115 | }
116 | 
117 | BOOST_AUTO_TEST_CASE(comment_lex)
118 | {
119 |     const char *input = "#this is a comment";
120 |     taihen_config_lexer ctx;
121 | 
122 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
123 | 
124 |     // we should expect comment token
125 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
126 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_COMMENT_TOKEN);
127 | 
128 |     BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
129 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
130 | }
131 | 
132 | BOOST_AUTO_TEST_CASE(path_lex)
133 | {
134 |     const char *input = "this:/is/a/path";
135 |     taihen_config_lexer ctx;
136 | 
137 |     BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
138 | 
139 |     // we should expect path token, this isnt valid config syntax
140 |     // but its not lexer job to ensure its correct order
141 |     // it just tokenises the input
142 |     BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
143 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_PATH_TOKEN);
144 | 
145 |     BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
146 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
147 | }
148 | 
149 | BOOST_AUTO_TEST_CASE(random_lex)
150 | {
151 |     std::random_device seed;
152 |     std::mt19937_64 mt;
153 |     std::vector<unsigned char> line(255);
154 |     taihen_config_lexer ctx;
155 | 
156 |     // seed mt from random device
157 |     mt.seed(seed());
158 | 
159 |     for (auto i = 0; i < 100000; ++i)
160 |     {
161 |         std::generate(line.begin(), std::prev(line.end()), mt);
162 | 
163 |         line[254] = '\0';
164 | 
165 |         BOOST_WARN_GE(taihen_config_init_lexer(&ctx, (char *)(line.data())), 0);
166 | 
167 |         while(1)
168 |         {
169 |             int res = taihen_config_lex(&ctx);
170 | 
171 |             if (res < 0)
172 |             {
173 |                 std::stringstream ss;
174 | 
175 |                 ss << "on generated data: " << std::hex << std::setfill('0');
176 | 
177 |                 std::for_each(line.begin(), line.end(), [&ss](auto& v)
178 |                 {
179 |                     ss << std::setw(2) << static_cast<unsigned>(v);
180 |                 });
181 | 
182 |                 ss << std::endl;
183 | 
184 | 
185 |                 BOOST_TEST_REQUIRE(res >= 0, ss.str());
186 |             }
187 | 
188 |             if (res == 0)
189 |             {
190 |                 break;
191 |             }
192 |         }
193 |     }
194 | 
195 |     BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
196 | }
197 | 
198 | BOOST_AUTO_TEST_CASE(long_line_lex)
199 | {
200 |     char line[CONFIG_MAX_LINE_LENGTH+1];
201 |     taihen_config_lexer ctx;
202 | 
203 |     std::memset(line, 'a', sizeof(line));
204 |     line[CONFIG_MAX_LINE_LENGTH] = '\0';
205 | 
206 |     BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, line), 0);
207 |     BOOST_REQUIRE_LT(taihen_config_lex(&ctx), 0);
208 | }
209 | 


--------------------------------------------------------------------------------
/test/test_parser.cpp:
--------------------------------------------------------------------------------
 1 | #include <taihen/parser.h>
 2 | 
 3 | #define BOOST_TEST_MODULE parser
 4 | #include <boost/test/unit_test.hpp>
 5 | 
 6 | BOOST_AUTO_TEST_CASE(removed_for_now)
 7 | {
 8 | 
 9 | }
10 | 


--------------------------------------------------------------------------------