├── test └── test.c ├── LICENSE ├── README.md └── lexer.h /test/test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(){ 5 | results_t res = lexer("numchars \"hello, world!\" == 7 + 5",4,"IDENTIFIER","[a-zA-Z]+","STRING","\"[a-zA-Z,! ]*\"","OPERATOR","\\+|==","NUM","[0-9]+"); 6 | printf("%d TOKENS RETURNED\n", res.ntoks); 7 | for(int i = 0 ; i < res.ntoks ; i++ ){ 8 | printf("TYPE : %s | TEXT : %s\n",res.toks[i].type, res.toks[i].str); 9 | } 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Lorca Heeney 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regex-lexer (Sep 2018) ![](https://img.shields.io/badge/C-GCC-brightgreen.svg) 2 | 3 | ![](https://benjam.info/blog/posts/2019-09-18-python-deep-dive-tokenizer/tokenizer-abstract.png) 4 | *Image credit: https://benjam.info/blog* 5 | 6 | # Overview 7 | A regular-expressions based lexer written in C. Easily include it in any project via a single header file with no external dependencies. It aims to make interpreters and compilers easier to write by handling the boring stuff for you. 8 | 9 | 10 | # How to use 11 | A small example of using the lexer is included in [test/test.c](test/test.c). For more detailed instructions... 12 | ## 1) Linking 13 | Clone the repository and move the single `lexer.h` file into your project's directory. Be sure to tell the compiler where to look for local header files (e.g. using the `-I` flag in GCC). No other dependencies are needed except the standard library's `stdlib`, `stdbool`, `stdarg`, `string` and `regex` header files. 14 | 15 | ## 2) Calling the lexer 16 | 17 | The main functionality of the library can be accessed using the `lexer` function, whose signature is shown below: 18 | ```c 19 | results_t lexer(char * source, unsigned int npatterns, ...); 20 | ``` 21 | The first argument is the text to be tokenized and the second the number of patterns that are to be passed in. Each pattern is passed in using two arguments, its name and POSIX regex pattern, both as strings, in that order. 22 | 23 | An example use of the lexer function to tokenize a simple piece of text with only a few patterns is shown below. 
24 | ```c 25 | lexer("numchars \"hello, world!\" == 7 + 5",4,"IDENTIFIER","[a-zA-Z]+","STRING","\"[a-zA-Z,!]*\"","OPERATOR","\\+|==","NUM","[0-9]+"); 26 | ``` 27 | 28 | ## 3) Reading the results 29 | The `lexer` function returns an instance of the `results_t` struct, which contains a list of `token_t` types. Both structure types are detailed below: 30 | ```c 31 | typedef struct { 32 | token_t * toks; 33 | unsigned int ntoks; 34 | } results_t; 35 | 36 | typedef struct { 37 | char * type; 38 | char * str; 39 | } token_t; 40 | ``` 41 | 42 | Below is an example of using this data to print a formatted list of tokens: 43 | ```c 44 | #include <stdio.h> 45 | ... 46 | results_t res = lexer("numchars \"hello, world!\" == 7 + 5",4,"IDENTIFIER","[a-zA-Z]+","STRING","\"[a-zA-Z,!]*\"","OPERATOR","\\+|==","NUM","[0-9]+"); 47 | ... 48 | printf("%d TOKENS RETURNED\n", res.ntoks); 49 | for(int i = 0 ; i < res.ntoks ; i++ ){ 50 | printf("TYPE : %s | TEXT : %s\n",res.toks[i].type, res.toks[i].str); 51 | } 52 | ... 53 | ``` 54 | Output: 55 | ``` 56 | 6 TOKENS RETURNED 57 | TYPE : IDENTIFIER | TEXT : numchars 58 | TYPE : STRING | TEXT : "hello,world!" 
/* (tail of README.md output example)
TYPE : OPERATOR | TEXT : ==
TYPE : NUM | TEXT : 7
TYPE : OPERATOR | TEXT : +
TYPE : NUM | TEXT : 5
```
--------------------------------------------------------------------------------
/lexer.h:
--------------------------------------------------------------------------------
*/
// -- HEADER GUARDS --
#ifndef LEXER_H
#define LEXER_H
// -- INCLUDES --
// Restored: the angle-bracketed header names were stripped in the dump.
// README lists exactly these five standard/POSIX dependencies.
#include <stdlib.h>
#include <stdbool.h>
#include <stdarg.h>
#include <string.h>
#include <regex.h>
// -- MACROS --
#define MAX_NUM_MATCHES 6    // regmatch_t slots handed to regexec()
#define MAX_NUM_PATTERNS 16  // capacity of the pattern table
#define MAX_NUM_TOKENS 128   // capacity of the token list
#define MAX_SOURCE_SIZE 512  // longest source string tokenize() accepts
// -- DATA STRUCTURES --
// One named POSIX regex pattern (strings are borrowed from the caller).
typedef struct {
    char * name;
    char * regex;
} pattern_t;

// One lexed token: its pattern name (borrowed) and matched text (heap-owned).
typedef struct {
    char * type;
    char * str;
} token_t;

// Global lexer state, created by initstateconfig(true) and released
// (except the token list, which is returned to the caller) by
// initstateconfig(false).
struct state_t {
    regex_t * obj;          // scratch regex object, recompiled per isvalid() call
    pattern_t * patterns;
    unsigned int npatterns;
    token_t * tokens;
    unsigned int ntokens;
    char * buffer;          // current partial-token text, NUL-terminated
    unsigned int buffsize;  // strlen(buffer)
} * state;

typedef struct {
    token_t * toks;
    unsigned int ntoks;
} results_t;
// -- PROTOTYPES --
results_t lexer(char * source, unsigned int npatterns, ...);
static void tokenize(char * source);
static void addtoken(char * tokenpattern, char * tokenname);
static void releasetoken(const char * typename);
static void incbuffer(const char c);
static void decbuffer(void);
static void resetbuffer(void);
static bool isvalid(const char * pattern, const char * target);
static void initstateconfig(bool new);
// -- IMPLEMENTATION

/*
 * Tokenize `source` against `npatterns` (name, POSIX-ERE) string pairs passed
 * as varargs, e.g. lexer(src, 1, "NUM", "[0-9]+").
 * Returns a results_t whose `toks` array and each token's `str` are heap
 * allocations now owned by the caller.
 */
results_t lexer(char * source, unsigned int npatterns, ...){
    initstateconfig(true);
    va_list args;
    va_start(args, npatterns);
    for(unsigned int i = 0 ; i < npatterns ; i++){
        // BUGFIX: the two va_arg() calls previously sat inside a single call
        // expression, where argument evaluation order is unspecified, so name
        // and regex could arrive swapped. Sequence them explicitly.
        char * name = va_arg(args, char *);
        char * regex = va_arg(args, char *);
        addtoken(regex, name);
    }
    va_end(args);
    tokenize(source);
    results_t retval = {.toks = state->tokens, .ntoks = state->ntokens};
    initstateconfig(false);
    return retval;
}

/*
 * Greedy maximal-munch scan: grow `buffer` one character at a time and, while
 * at least one pattern fully matches it, keep extending; when the next
 * character breaks every match, back up one character and emit the token.
 * Top-level spaces separate tokens and never enter the buffer.
 */
static void tokenize(char * source){
    const size_t len = strlen(source);  // hoisted out of the loop conditions
    if (len > MAX_SOURCE_SIZE){
        return;  // silently refuse oversized input (original behavior)
    }
    for(size_t i = 0 ; i < len ; i++){
        if(source[i] == ' '){
            continue;
        }
        incbuffer(source[i]);
        bool success = false;
        char * recentname = NULL;
        for(unsigned int j = 0 ; j < state->npatterns ; j++){
            if(isvalid(state->patterns[j].regex, state->buffer)){
                recentname = state->patterns[j].name;
                success = true;
                break;  // first (highest-priority) pattern seeds the token
            }
        }
        while(success){
            if(i == len - 1){
                if(recentname != NULL){
                    releasetoken(recentname);
                    // BUGFIX: detach the buffer from the emitted token so the
                    // cleanup in initstateconfig(false) cannot free its text.
                    resetbuffer();
                }
                break;
            }
            incbuffer(source[++i]);
            success = false;
            for(unsigned int j = 0 ; j < state->npatterns ; j++){
                if(isvalid(state->patterns[j].regex, state->buffer)){
                    // NOTE: unlike the seed loop above, this scan keeps the
                    // LAST matching pattern (original behavior, preserved).
                    recentname = state->patterns[j].name;
                    success = true;
                }
            }
            if(!success){
                i--;
                decbuffer();
                releasetoken(recentname);
                resetbuffer();
                break;
            }
        }
    }
}

// Append one (regex, name) pattern; silently drops patterns beyond capacity.
static void addtoken(char * tokenpattern, char * tokenname){
    // BUGFIX: was bounds-checking ntokens against MAX_NUM_TOKENS, which let
    // the 16-slot pattern table overflow; check the pattern count instead.
    if(state->npatterns < MAX_NUM_PATTERNS){
        pattern_t tmp = {.name = tokenname, .regex = tokenpattern};
        state->patterns[state->npatterns++] = tmp;
    }
}

// Emit the current buffer as a token of type `typename`. Ownership of the
// buffer string moves to the token; the caller must resetbuffer() afterwards.
static void releasetoken(const char * typename){
    if(state->ntokens < MAX_NUM_TOKENS){
        token_t tmp = {.type = (char*)typename, .str = state->buffer};
        state->tokens[state->ntokens++] = tmp;
    }
}

// Append one character to the buffer, keeping it NUL-terminated.
static void incbuffer(const char c){
    // BUGFIX: realloc through a temporary so the old buffer is not lost
    // (and not dereferenced) if allocation fails.
    char * tmp = realloc(state->buffer, state->buffsize + 2);  // +1 char, +1 NUL
    if(tmp == NULL){
        return;  // out of memory: keep the old buffer unchanged
    }
    state->buffer = tmp;
    state->buffer[state->buffsize++] = c;
    state->buffer[state->buffsize] = '\0';
}

// Drop the last character from the buffer.
// BUGFIX: the original shrank the allocation to buffsize-1 bytes, leaving no
// room for the terminating NUL (heap overread on the next strlen); keep
// buffsize bytes and re-terminate explicitly.
static void decbuffer(void){
    if(state->buffsize != 0){
        char * tmp = realloc(state->buffer, state->buffsize);
        if(tmp != NULL){
            state->buffer = tmp;
        }
        state->buffsize--;
        state->buffer[state->buffsize] = '\0';
    }
}

// Start a fresh, empty buffer (the previous one is now owned by a token).
static void resetbuffer(void){
    state->buffer = malloc(1);
    state->buffsize = 0;
    if(state->buffer != NULL){
        state->buffer[0] = '\0';  // BUGFIX: byte was left uninitialized
    }
}

// True iff `pattern` (POSIX ERE) matches ALL of `target` and the match is
// non-empty.
static bool isvalid(const char * pattern, const char * target){
    // BUGFIX: the compile result was ignored; a bad pattern simply fails.
    if(regcomp(state->obj, pattern, REG_EXTENDED) != 0){
        return false;
    }
    regmatch_t matches[MAX_NUM_MATCHES];  // BUGFIX: was a leaked malloc
    const int rc = regexec(state->obj, target, MAX_NUM_MATCHES, matches, 0);
    regfree(state->obj);  // BUGFIX: compiled regex was never freed (leak per call)
    if(rc != 0){
        return false;
    }
    const size_t len = strlen(target);
    for(int i = 0 ; i < MAX_NUM_MATCHES ; i++){
        if(matches[i].rm_so == 0 && (size_t)matches[i].rm_eo == len && matches[i].rm_eo != 0){
            return true;
        } else if (matches[i].rm_so == -1){
            break;  // no further (sub)matches reported
        }
    }
    return false;
}

// new == true : allocate the global lexer state and zero its counters.
// new == false: release everything EXCEPT the token array and token strings,
//               whose ownership has passed to lexer()'s caller via results_t.
static void initstateconfig(bool new){
    if(new){
        state = malloc(sizeof(struct state_t));
        state->obj = malloc(sizeof(regex_t));
        state->patterns = malloc(sizeof(pattern_t) * MAX_NUM_PATTERNS);
        state->npatterns = 0;     // BUGFIX: counters were never initialized (UB)
        state->tokens = malloc(sizeof(token_t) * MAX_NUM_TOKENS);
        state->ntokens = 0;       // BUGFIX: see above
        state->buffer = malloc(1);
        state->buffer[0] = '\0';  // BUGFIX: buffer was not NUL-terminated
        state->buffsize = 0;      // BUGFIX: was never initialized
    } else {
        // BUGFIX: the false branch previously leaked all lexer-internal state.
        free(state->obj);
        free(state->patterns);
        free(state->buffer);
        free(state);
        state = NULL;
    }
}
#endif
/* -------------------------------------------------------------------------------- */