├── .gitignore ├── README.md ├── ctok.c ├── demo.py ├── setup.cfg ├── setup.py ├── test_ctok.py ├── v35tokenizer.h ├── v36tokenizer.h ├── v37tokenizer.h └── v38tokenizer.h /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | *.so 3 | *.egg-info 4 | .eggs 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CPython's tokenizer exposed as a Python class 2 | ============================================= 3 | 4 | ``` 5 | >>> import ctok 6 | >>> tok = ctok.CTok(b"(hello+world)") 7 | >>> for token in tok: print(token) 8 | ... 9 | (7, b'(', (1, 0), (1, 1)) 10 | (1, b'hello', (1, 1), (1, 6)) 11 | (14, b'+', (1, 6), (1, 7)) 12 | (1, b'world', (1, 7), (1, 12)) 13 | (8, b')', (1, 12), (1, 13)) 14 | >>> 15 | ``` 16 | 17 | TODO 18 | ---- 19 | 20 | - Support reading from a file/stream 21 | - Support str instead of (or in addition to) bytes? 
22 | -------------------------------------------------------------------------------- /ctok.c: -------------------------------------------------------------------------------- 1 | #define PY_SSIZE_T_CLEAN 2 | #include 3 | #include 4 | 5 | #if PY_MAJOR_VERSION == 3 6 | # if PY_MINOR_VERSION == 5 7 | # include "v35tokenizer.h" 8 | # elif PY_MINOR_VERSION == 6 9 | # include "v36tokenizer.h" 10 | # elif PY_MINOR_VERSION == 7 11 | # include "v37tokenizer.h" 12 | # elif PY_MINOR_VERSION >= 8 13 | # include "v38tokenizer.h" 14 | # else 15 | # error "Only Python 3.5 and higher are supported" 16 | # endif 17 | #else 18 | # error "Python 2 is not supported" 19 | #endif 20 | 21 | typedef struct { 22 | PyObject_HEAD 23 | struct tok_state *tok; 24 | } CTokObject; 25 | 26 | static void 27 | CTok_dealloc(CTokObject *self) 28 | { 29 | if (self->tok != NULL) { 30 | PyTokenizer_Free(self->tok); 31 | self->tok = NULL; 32 | } 33 | 34 | Py_TYPE(self)->tp_free((PyObject *) self); 35 | } 36 | 37 | static PyObject * 38 | CTok_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 39 | { 40 | CTokObject *self = (CTokObject *) type->tp_alloc(type, 0); 41 | if (self == NULL) 42 | return NULL; 43 | 44 | self->tok = NULL; 45 | return (PyObject *) self; 46 | } 47 | 48 | static int 49 | CTok_init(CTokObject *self, PyObject *args, PyObject *kwds) 50 | { 51 | static char *kwlist[] = {"input", NULL}; 52 | PyObject *input; 53 | 54 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "S", kwlist, &input)) 55 | return -1; 56 | 57 | char *bytes = PyBytes_AsString(input); 58 | if (bytes == NULL) 59 | return -1; 60 | 61 | self->tok = PyTokenizer_FromString(bytes, 0); 62 | if (self->tok == NULL) 63 | return -1; 64 | 65 | return 0; 66 | } 67 | 68 | static PyObject * 69 | CTok_get_raw(CTokObject *self, PyObject *Py_UNUSED(ignored)) 70 | { 71 | if (self->tok == NULL) { 72 | PyErr_SetString(PyExc_ValueError, "Uninitalized tokenizer"); 73 | return NULL; 74 | } 75 | 76 | char *start = NULL, *end = NULL; 77 | int 
type = PyTokenizer_Get(self->tok, &start, &end); 78 | int istart = -1, iend = -1; 79 | if (start != NULL) 80 | istart = start - self->tok->input; 81 | if (end != NULL) 82 | iend = end - self->tok->input; 83 | return Py_BuildValue("(iii)", type, istart, iend); 84 | } 85 | 86 | static PyObject * 87 | CTok_get(CTokObject *self, PyObject *Py_UNUSED(ignored)) 88 | { 89 | if (self->tok == NULL) { 90 | PyErr_SetString(PyExc_ValueError, "Uninitalized tokenizer"); 91 | return NULL; 92 | } 93 | 94 | char *start = NULL, *end = NULL; 95 | int type = PyTokenizer_Get(self->tok, &start, &end); 96 | if (type == ERRORTOKEN) { 97 | PyErr_Format(PyExc_SyntaxError, "error at line %d", self->tok->lineno); 98 | return NULL; 99 | } 100 | if (type == ENDMARKER) { 101 | PyErr_Format(PyExc_StopIteration, "end of input at line %d", self->tok->lineno); 102 | return NULL; 103 | } 104 | 105 | PyObject *value = NULL; 106 | if (start == NULL || end == NULL) { 107 | value = Py_None; 108 | Py_INCREF(value); 109 | } 110 | else { 111 | value = PyBytes_FromStringAndSize(start, end-start); 112 | if (value == NULL) 113 | return NULL; 114 | } 115 | 116 | // After parsetok.c 117 | struct tok_state *tok = self->tok; 118 | #if PY_MINOR_VERSION >= 8 119 | int lineno = type == STRING ? tok->first_lineno : tok->lineno; 120 | const char *line_start = type == STRING ? 
tok->multi_line_start : tok->line_start; 121 | #else 122 | int lineno = tok->lineno; 123 | const char *line_start = tok->line_start; 124 | #endif 125 | int end_lineno = tok->lineno; 126 | int col_offset = -1, end_col_offset = -1; 127 | if (start != NULL && start >= line_start) 128 | col_offset = start - line_start; 129 | if (end != NULL && end >= tok->line_start) 130 | end_col_offset = end - tok->line_start; 131 | 132 | return Py_BuildValue("(iO(ii)(ii))", type, value, lineno, col_offset, end_lineno, end_col_offset); 133 | } 134 | 135 | static PyObject * 136 | CTok_iter(PyObject *self) 137 | { 138 | Py_INCREF(self); 139 | return self; 140 | } 141 | 142 | static PyObject * 143 | CTok_iternext(PyObject *self) 144 | { 145 | return CTok_get((CTokObject *)self, NULL); 146 | } 147 | 148 | static PyMethodDef CTok_methods[] = { 149 | {"get", (PyCFunction) CTok_get, METH_NOARGS, 150 | "Get the next token\n" 151 | "\n" 152 | "Returns (type, string, (line, col), (endline, endcol))." 153 | }, 154 | {"get_raw", (PyCFunction) CTok_get_raw, METH_NOARGS, 155 | "Get the next token without allocating much\n" 156 | "\n" 157 | "Returns (type, start, end) where start and end point into self.input()." 
158 |     },
159 |     {NULL} /* Sentinel */
160 | };
161 | 
162 | static PyObject *
163 | CTok_input(CTokObject *self, void *closure) /* .input getter: newline-translated copy of the source as bytes, or None */
164 | {
165 |     if (self->tok == NULL) {
166 |         PyErr_SetString(PyExc_ValueError, "Uninitialized tokenizer"); /* fixed typo: was "Uninitalized" */
167 |         return NULL;
168 |     }
169 | 
170 |     if (self->tok->input == NULL) {
171 |         Py_RETURN_NONE;
172 |     }
173 | 
174 |     return PyBytes_FromString(self->tok->input);
175 | }
176 | 
177 | static PyObject *
178 | CTok_encoding(CTokObject *self, void *closure) /* .encoding getter: source encoding the tokenizer detected, or None */
179 | {
180 |     if (self->tok == NULL) {
181 |         PyErr_SetString(PyExc_ValueError, "Uninitialized tokenizer"); /* fixed typo: was "Uninitalized" */
182 |         return NULL;
183 |     }
184 | 
185 |     if (self->tok->encoding == NULL) {
186 |         Py_RETURN_NONE;
187 |     }
188 | 
189 |     return PyUnicode_FromString(self->tok->encoding);
190 | }
191 | 
192 | static PyGetSetDef CTok_getsetters[] = {
193 |     {"input", (getter) CTok_input, (setter) NULL,
194 |      "Input string encoded by the tokenizer (bytes)", NULL},
195 |     {"encoding", (getter) CTok_encoding, (setter) NULL,
196 |      "Encoding discovered by the tokenizer", NULL},
197 |     {NULL} /* Sentinel */
198 | };
199 | 
200 | static PyTypeObject CTokType = {
201 |     PyVarObject_HEAD_INIT(NULL, 0)
202 |     .tp_name = "ctok.CTok",
203 |     .tp_basicsize = sizeof(CTokObject),
204 |     .tp_itemsize = 0,
205 |     .tp_dealloc = (destructor) CTok_dealloc,
206 |     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
207 |     .tp_doc = "C Tokenizer",
208 |     .tp_iter = CTok_iter,
209 |     .tp_iternext = CTok_iternext,
210 |     .tp_methods = CTok_methods,
211 |     .tp_getset = CTok_getsetters,
212 |     .tp_init = (initproc) CTok_init,
213 |     .tp_new = CTok_new,
214 | };
215 | 
216 | static struct PyModuleDef ctokmodule = {
217 |     PyModuleDef_HEAD_INIT,
218 |     .m_name = "ctok",
219 |     .m_doc = "Expose CPython's tokenizer as a Python class",
220 | };
221 | 
222 | PyMODINIT_FUNC
223 | PyInit_ctok(void)
224 | {
225 |     if (PyType_Ready(&CTokType) < 0)
226 |         return NULL;
227 | 
228 |     PyObject *m = PyModule_Create(&ctokmodule);
229 |     if (m == NULL)
230 |         return NULL;
231 | 
232 | Py_INCREF(&CTokType); 233 | PyModule_AddObject(m, "CTok", (PyObject *) &CTokType); 234 | 235 | return m; 236 | } 237 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # Reproduce the code in README.md. 2 | from token import * 3 | 4 | import ctok 5 | 6 | tok = ctok.CTok(b"(hello+world)") 7 | for token in tok: 8 | print(token) 9 | 10 | print("Raw:") 11 | tok = ctok.CTok(b"(hello+world)") 12 | while True: 13 | token = tok.get_raw() 14 | print(token) 15 | if token[0] in (ENDMARKER, ERRORTOKEN): 16 | break 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | 3 | mods = [Extension('ctok', sources = ['ctok.c'])] 4 | 5 | setup( 6 | name='ctok', 7 | version='0.0', 8 | description="Expose CPython's tokenizer as a Python class", 9 | ext_modules=mods, 10 | setup_requires=["pytest-runner"], 11 | tests_require=["pytest"], 12 | ) 13 | -------------------------------------------------------------------------------- /test_ctok.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from token import * 3 | 4 | import pytest 5 | 6 | import ctok 7 | 8 | def test_basic(): 9 | input = b"(hello+world)" 10 | tokens = list(ctok.CTok(input)) 11 | assert tokens == [ 12 | (LPAR, b'(', (1, 0), (1, 1)), 13 | (NAME, b'hello', (1, 1), (1, 6)), 14 | (PLUS, b'+', (1, 6), (1, 7)), 15 | (NAME, b'world', (1, 7), (1, 12)), 16 | (RPAR, b')', (1, 12), (1, 13)), 17 | ] 18 | 19 | def test_indent(): 20 | input = b"if 1:\n pass\npass" 21 | tokens = 
list(ctok.CTok(input)) 22 | assert tokens == [ 23 | (NAME, b'if', (1, 0), (1, 2)), 24 | (NUMBER, b'1', (1, 3), (1, 4)), 25 | (COLON, b':', (1, 4), (1, 5)), 26 | (NEWLINE, b'', (1, 5), (1, 5)), 27 | (INDENT, None, (2, -1), (2, -1)), 28 | (NAME, b'pass', (2, 2), (2, 6)), 29 | (NEWLINE, b'', (2, 6), (2, 6)), 30 | (DEDENT, None, (3, -1), (3, -1)), 31 | (NAME, b'pass', (3, 0), (3, 4)), 32 | ] 33 | 34 | def test_no_indent(): 35 | input = b"(foo\n bar)" 36 | tokens = list(ctok.CTok(input)) 37 | assert tokens == [ 38 | (LPAR, b'(', (1, 0), (1, 1)), 39 | (NAME, b'foo', (1, 1), (1, 4)), 40 | # No NEWLINE, INDENT here! 41 | (NAME, b'bar', (2, 2), (2, 5)), 42 | (RPAR, b')', (2, 5), (2, 6)), 43 | ] 44 | 45 | def test_multi_line_string(): 46 | input = b"'''foo\nbar'''" 47 | tokens = list(ctok.CTok(input)) 48 | if sys.version_info >= (3, 8): 49 | start = (1, 0) 50 | else: 51 | # Older Python versions don't have the correct line number for 52 | # the start of a multi-line string. 53 | start = (2, -1) 54 | assert tokens == [ 55 | (STRING, b"'''foo\nbar'''", start, (2, 6)), 56 | ] 57 | 58 | def test_input_cr(): 59 | input = b"foo\rbar" 60 | tok = ctok.CTok(input) 61 | assert tok.input == b"foo\nbar" 62 | 63 | def test_input_crlf(): 64 | input = b"foo\r\nbar" 65 | tok = ctok.CTok(input) 66 | assert tok.input == b"foo\nbar" 67 | 68 | def test_encoding(): 69 | input = b"# coding: latin-1\nfoo\nbar" 70 | tok = ctok.CTok(input) 71 | assert tok.encoding == "iso-8859-1" 72 | 73 | def test_encoding_default(): 74 | input = b"foo\nbar" 75 | tok = ctok.CTok(input) 76 | assert tok.encoding is None 77 | 78 | def test_get_raw(): 79 | input = b"foo bar\r\nbaz" 80 | tok = ctok.CTok(input) 81 | assert tok.get_raw() == (NAME, 0, 3) 82 | assert tok.get_raw() == (NAME, 4, 7) 83 | assert tok.get_raw() == (NEWLINE, 7, 7) 84 | assert tok.get_raw() == (NAME, 8, 11) 85 | assert tok.get_raw() == (ENDMARKER, -1, -1) 86 | 87 | def test_endmarker(): 88 | input = b"foo\nbar\n" 89 | tok = ctok.CTok(input) 90 | 
tok.get() 91 | tok.get() 92 | assert tok.get() == (NAME, b"bar", (2, 0), (2, 3)) 93 | assert tok.get() == (NEWLINE, b"", (2, 3), (2, 3)) 94 | with pytest.raises(StopIteration) as excinfo: 95 | tok.get() 96 | assert "end of input at line 2" in str(excinfo.value) 97 | 98 | def test_error(): 99 | input = b"foo\n'bar" 100 | tok = ctok.CTok(input) 101 | assert tok.get() == (NAME, b"foo", (1, 0), (1, 3)) 102 | assert tok.get()[0] == NEWLINE 103 | with pytest.raises(SyntaxError) as excinfo: 104 | tok.get() 105 | assert "error at line 2" in str(excinfo.value) 106 | -------------------------------------------------------------------------------- /v35tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | 15 | enum decoding_state { 16 | STATE_INIT, 17 | STATE_RAW, 18 | STATE_NORMAL /* have a codec associated with input */ 19 | }; 20 | 21 | /* Tokenizer state */ 22 | struct tok_state { 23 | /* Input state; buf <= cur <= inp <= end */ 24 | /* NB an entire line is held in the buffer */ 25 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 26 | char *cur; /* Next character in buffer */ 27 | char *inp; /* End of data in buffer */ 28 | char *end; /* End of input buffer if buf != NULL */ 29 | char *start; /* Start of current token if not NULL */ 30 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 31 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 32 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 33 | int tabsize; /* Tab spacing */ 34 | int indent; /* Current indentation index */ 35 | int indstack[MAXINDENT]; /* Stack of indents */ 36 | int atbol; /* Nonzero if at begin of new line */ 37 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 38 | const char *prompt, *nextprompt; /* For interactive prompting */ 39 | int lineno; /* Current line number */ 40 | int level; /* () [] {} Parentheses nesting level */ 41 | /* Used to allow free continuations inside them */ 42 | /* Stuff for checking on different tab sizes */ 43 | #ifndef PGEN 44 | /* pgen doesn't have access to Python codecs, it cannot decode the input 45 | filename. The bytes filename might be kept, but it is only used by 46 | indenterror() and it is not really needed: pgen only compiles one file 47 | (Grammar/Grammar). */ 48 | PyObject *filename; 49 | #endif 50 | int altwarning; /* Issue warning if alternate tabs don't match */ 51 | int alterror; /* Issue error if alternate tabs don't match */ 52 | int alttabsize; /* Alternate tab spacing */ 53 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 54 | /* Stuff for PEP 0263 */ 55 | enum decoding_state decoding_state; 56 | int decoding_erred; /* whether erred in decoding */ 57 | int read_coding_spec; /* whether 'coding:...' has been read */ 58 | char *encoding; /* Source encoding. */ 59 | int cont_line; /* whether we are in a continuation line. */ 60 | const char* line_start; /* pointer to start of current line */ 61 | #ifndef PGEN 62 | PyObject *decoding_readline; /* open(...).readline */ 63 | PyObject *decoding_buffer; 64 | #endif 65 | const char* enc; /* Encoding for the current str. */ 66 | const char* str; 67 | const char* input; /* Tokenizer's newline translated copy of the string. */ 68 | 69 | /* async/await related fields; can be removed in 3.7 when async and await 70 | become normal keywords. 
*/ 71 | int async_def; /* =1 if tokens are inside an 'async def' body. */ 72 | int async_def_indent; /* Indentation level of the outermost 'async def'. */ 73 | int async_def_nl; /* =1 if the outermost 'async def' had at least one 74 | NEWLINE token after it. */ 75 | }; 76 | 77 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 78 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 79 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 80 | const char *, const char *); 81 | extern void PyTokenizer_Free(struct tok_state *); 82 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 83 | extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 84 | int len, int *offset); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | #endif /* !Py_TOKENIZER_H */ 90 | -------------------------------------------------------------------------------- /v36tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | 15 | enum decoding_state { 16 | STATE_INIT, 17 | STATE_RAW, 18 | STATE_NORMAL /* have a codec associated with input */ 19 | }; 20 | 21 | /* Tokenizer state */ 22 | struct tok_state { 23 | /* Input state; buf <= cur <= inp <= end */ 24 | /* NB an entire line is held in the buffer */ 25 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 26 | char *cur; /* Next character in buffer */ 27 | char *inp; /* End of data in buffer */ 28 | char *end; /* End of input buffer if buf != NULL */ 29 | char *start; /* Start of current token if not NULL */ 30 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 31 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 32 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 33 | int tabsize; /* Tab spacing */ 34 | int indent; /* Current indentation index */ 35 | int indstack[MAXINDENT]; /* Stack of indents */ 36 | int atbol; /* Nonzero if at begin of new line */ 37 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 38 | const char *prompt, *nextprompt; /* For interactive prompting */ 39 | int lineno; /* Current line number */ 40 | int level; /* () [] {} Parentheses nesting level */ 41 | /* Used to allow free continuations inside them */ 42 | /* Stuff for checking on different tab sizes */ 43 | #ifndef PGEN 44 | /* pgen doesn't have access to Python codecs, it cannot decode the input 45 | filename. The bytes filename might be kept, but it is only used by 46 | indenterror() and it is not really needed: pgen only compiles one file 47 | (Grammar/Grammar). */ 48 | PyObject *filename; 49 | #endif 50 | int altwarning; /* Issue warning if alternate tabs don't match */ 51 | int alterror; /* Issue error if alternate tabs don't match */ 52 | int alttabsize; /* Alternate tab spacing */ 53 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 54 | /* Stuff for PEP 0263 */ 55 | enum decoding_state decoding_state; 56 | int decoding_erred; /* whether erred in decoding */ 57 | int read_coding_spec; /* whether 'coding:...' has been read */ 58 | char *encoding; /* Source encoding. */ 59 | int cont_line; /* whether we are in a continuation line. */ 60 | const char* line_start; /* pointer to start of current line */ 61 | #ifndef PGEN 62 | PyObject *decoding_readline; /* open(...).readline */ 63 | PyObject *decoding_buffer; 64 | #endif 65 | const char* enc; /* Encoding for the current str. */ 66 | const char* str; 67 | const char* input; /* Tokenizer's newline translated copy of the string. */ 68 | 69 | /* async/await related fields; can be removed in 3.7 when async and await 70 | become normal keywords. 
*/ 71 | int async_def; /* =1 if tokens are inside an 'async def' body. */ 72 | int async_def_indent; /* Indentation level of the outermost 'async def'. */ 73 | int async_def_nl; /* =1 if the outermost 'async def' had at least one 74 | NEWLINE token after it. */ 75 | }; 76 | 77 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 78 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 79 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 80 | const char *, const char *); 81 | extern void PyTokenizer_Free(struct tok_state *); 82 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 83 | extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 84 | int len, int *offset); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | #endif /* !Py_TOKENIZER_H */ 90 | -------------------------------------------------------------------------------- /v37tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | 15 | enum decoding_state { 16 | STATE_INIT, 17 | STATE_RAW, 18 | STATE_NORMAL /* have a codec associated with input */ 19 | }; 20 | 21 | /* Tokenizer state */ 22 | struct tok_state { 23 | /* Input state; buf <= cur <= inp <= end */ 24 | /* NB an entire line is held in the buffer */ 25 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 26 | char *cur; /* Next character in buffer */ 27 | char *inp; /* End of data in buffer */ 28 | char *end; /* End of input buffer if buf != NULL */ 29 | char *start; /* Start of current token if not NULL */ 30 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 31 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 32 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 33 | int tabsize; /* Tab spacing */ 34 | int indent; /* Current indentation index */ 35 | int indstack[MAXINDENT]; /* Stack of indents */ 36 | int atbol; /* Nonzero if at begin of new line */ 37 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 38 | const char *prompt, *nextprompt; /* For interactive prompting */ 39 | int lineno; /* Current line number */ 40 | int level; /* () [] {} Parentheses nesting level */ 41 | /* Used to allow free continuations inside them */ 42 | /* Stuff for checking on different tab sizes */ 43 | #ifndef PGEN 44 | /* pgen doesn't have access to Python codecs, it cannot decode the input 45 | filename. The bytes filename might be kept, but it is only used by 46 | indenterror() and it is not really needed: pgen only compiles one file 47 | (Grammar/Grammar). */ 48 | PyObject *filename; 49 | #endif 50 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 51 | /* Stuff for PEP 0263 */ 52 | enum decoding_state decoding_state; 53 | int decoding_erred; /* whether erred in decoding */ 54 | int read_coding_spec; /* whether 'coding:...' has been read */ 55 | char *encoding; /* Source encoding. */ 56 | int cont_line; /* whether we are in a continuation line. */ 57 | const char* line_start; /* pointer to start of current line */ 58 | #ifndef PGEN 59 | PyObject *decoding_readline; /* open(...).readline */ 60 | PyObject *decoding_buffer; 61 | #endif 62 | const char* enc; /* Encoding for the current str. */ 63 | const char* str; 64 | const char* input; /* Tokenizer's newline translated copy of the string. 
*/ 65 | }; 66 | 67 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 68 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 69 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 70 | const char *, const char *); 71 | extern void PyTokenizer_Free(struct tok_state *); 72 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 73 | 74 | #ifdef __cplusplus 75 | } 76 | #endif 77 | #endif /* !Py_TOKENIZER_H */ 78 | -------------------------------------------------------------------------------- /v38tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | #define MAXLEVEL 200 /* Max parentheses level */ 15 | 16 | enum decoding_state { 17 | STATE_INIT, 18 | STATE_RAW, 19 | STATE_NORMAL /* have a codec associated with input */ 20 | }; 21 | 22 | /* Tokenizer state */ 23 | struct tok_state { 24 | /* Input state; buf <= cur <= inp <= end */ 25 | /* NB an entire line is held in the buffer */ 26 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 27 | char *cur; /* Next character in buffer */ 28 | char *inp; /* End of data in buffer */ 29 | char *end; /* End of input buffer if buf != NULL */ 30 | char *start; /* Start of current token if not NULL */ 31 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 32 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 33 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 34 | int tabsize; /* Tab spacing */ 35 | int indent; /* Current indentation index */ 36 | int indstack[MAXINDENT]; /* Stack of indents */ 37 | int atbol; /* Nonzero if at begin of new line */ 38 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 39 | const char *prompt, *nextprompt; /* For interactive prompting */ 40 | int lineno; /* Current line number */ 41 | int first_lineno; /* First line of a single line or multi line string 42 | expression (cf. issue 16806) */ 43 | int level; /* () [] {} Parentheses nesting level */ 44 | /* Used to allow free continuations inside them */ 45 | char parenstack[MAXLEVEL]; 46 | int parenlinenostack[MAXLEVEL]; 47 | PyObject *filename; 48 | /* Stuff for checking on different tab sizes */ 49 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 50 | /* Stuff for PEP 0263 */ 51 | enum decoding_state decoding_state; 52 | int decoding_erred; /* whether erred in decoding */ 53 | int read_coding_spec; /* whether 'coding:...' has been read */ 54 | char *encoding; /* Source encoding. */ 55 | int cont_line; /* whether we are in a continuation line. */ 56 | const char* line_start; /* pointer to start of current line */ 57 | const char* multi_line_start; /* pointer to start of first line of 58 | a single line or multi line string 59 | expression (cf. issue 16806) */ 60 | PyObject *decoding_readline; /* open(...).readline */ 61 | PyObject *decoding_buffer; 62 | const char* enc; /* Encoding for the current str. */ 63 | const char* str; 64 | const char* input; /* Tokenizer's newline translated copy of the string. */ 65 | 66 | int type_comments; /* Whether to look for type comments */ 67 | 68 | /* async/await related fields (still needed depending on feature_version) */ 69 | int async_hacks; /* =1 if async/await aren't always keywords */ 70 | int async_def; /* =1 if tokens are inside an 'async def' body. 
*/ 71 | int async_def_indent; /* Indentation level of the outermost 'async def'. */ 72 | int async_def_nl; /* =1 if the outermost 'async def' had at least one 73 | NEWLINE token after it. */ 74 | }; 75 | 76 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 77 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 78 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 79 | const char *, const char *); 80 | extern void PyTokenizer_Free(struct tok_state *); 81 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 82 | 83 | #define tok_dump _Py_tok_dump 84 | 85 | #ifdef __cplusplus 86 | } 87 | #endif 88 | #endif /* !Py_TOKENIZER_H */ 89 | --------------------------------------------------------------------------------