├── .gitignore ├── README.md ├── ctok.c ├── demo.py ├── setup.cfg ├── setup.py ├── test_ctok.py ├── v35tokenizer.h ├── v36tokenizer.h ├── v37tokenizer.h └── v38tokenizer.h /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | *.so 3 | *.egg-info 4 | .eggs 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CPython's tokenizer exposed as a Python class 2 | ============================================= 3 | 4 | ``` 5 | >>> import ctok 6 | >>> tok = ctok.CTok(b"(hello+world)") 7 | >>> for token in tok: print(token) 8 | ... 9 | (7, b'(', (1, 0), (1, 1)) 10 | (1, b'hello', (1, 1), (1, 6)) 11 | (14, b'+', (1, 6), (1, 7)) 12 | (1, b'world', (1, 7), (1, 12)) 13 | (8, b')', (1, 12), (1, 13)) 14 | >>> 15 | ``` 16 | 17 | TODO 18 | ---- 19 | 20 | - Support reading from a file/stream 21 | - Support str instead of (or in addition to) bytes? 
22 | -------------------------------------------------------------------------------- /ctok.c: -------------------------------------------------------------------------------- 1 | #define PY_SSIZE_T_CLEAN 2 | #include 3 | #include 4 | 5 | #if PY_MAJOR_VERSION == 3 6 | # if PY_MINOR_VERSION == 5 7 | # include "v35tokenizer.h" 8 | # elif PY_MINOR_VERSION == 6 9 | # include "v36tokenizer.h" 10 | # elif PY_MINOR_VERSION == 7 11 | # include "v37tokenizer.h" 12 | # elif PY_MINOR_VERSION >= 8 13 | # include "v38tokenizer.h" 14 | # else 15 | # error "Only Python 3.5 and higher are supported" 16 | # endif 17 | #else 18 | # error "Python 2 is not supported" 19 | #endif 20 | 21 | typedef struct { 22 | PyObject_HEAD 23 | struct tok_state *tok; 24 | } CTokObject; 25 | 26 | static void 27 | CTok_dealloc(CTokObject *self) 28 | { 29 | if (self->tok != NULL) { 30 | PyTokenizer_Free(self->tok); 31 | self->tok = NULL; 32 | } 33 | 34 | Py_TYPE(self)->tp_free((PyObject *) self); 35 | } 36 | 37 | static PyObject * 38 | CTok_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 39 | { 40 | CTokObject *self = (CTokObject *) type->tp_alloc(type, 0); 41 | if (self == NULL) 42 | return NULL; 43 | 44 | self->tok = NULL; 45 | return (PyObject *) self; 46 | } 47 | 48 | static int 49 | CTok_init(CTokObject *self, PyObject *args, PyObject *kwds) 50 | { 51 | static char *kwlist[] = {"input", NULL}; 52 | PyObject *input; 53 | 54 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "S", kwlist, &input)) 55 | return -1; 56 | 57 | char *bytes = PyBytes_AsString(input); 58 | if (bytes == NULL) 59 | return -1; 60 | 61 | self->tok = PyTokenizer_FromString(bytes, 0); 62 | if (self->tok == NULL) 63 | return -1; 64 | 65 | return 0; 66 | } 67 | 68 | static PyObject * 69 | CTok_get_raw(CTokObject *self, PyObject *Py_UNUSED(ignored)) 70 | { 71 | if (self->tok == NULL) { 72 | PyErr_SetString(PyExc_ValueError, "Uninitalized tokenizer"); 73 | return NULL; 74 | } 75 | 76 | char *start = NULL, *end = NULL; 77 | int 
type = PyTokenizer_Get(self->tok, &start, &end); 78 | int istart = -1, iend = -1; 79 | if (start != NULL) 80 | istart = start - self->tok->input; 81 | if (end != NULL) 82 | iend = end - self->tok->input; 83 | return Py_BuildValue("(iii)", type, istart, iend); 84 | } 85 | 86 | static PyObject * 87 | CTok_get(CTokObject *self, PyObject *Py_UNUSED(ignored)) 88 | { 89 | if (self->tok == NULL) { 90 | PyErr_SetString(PyExc_ValueError, "Uninitalized tokenizer"); 91 | return NULL; 92 | } 93 | 94 | char *start = NULL, *end = NULL; 95 | int type = PyTokenizer_Get(self->tok, &start, &end); 96 | if (type == ERRORTOKEN) { 97 | PyErr_Format(PyExc_SyntaxError, "error at line %d", self->tok->lineno); 98 | return NULL; 99 | } 100 | if (type == ENDMARKER) { 101 | PyErr_Format(PyExc_StopIteration, "end of input at line %d", self->tok->lineno); 102 | return NULL; 103 | } 104 | 105 | PyObject *value = NULL; 106 | if (start == NULL || end == NULL) { 107 | value = Py_None; 108 | Py_INCREF(value); 109 | } 110 | else { 111 | value = PyBytes_FromStringAndSize(start, end-start); 112 | if (value == NULL) 113 | return NULL; 114 | } 115 | 116 | // After parsetok.c 117 | struct tok_state *tok = self->tok; 118 | #if PY_MINOR_VERSION >= 8 119 | int lineno = type == STRING ? tok->first_lineno : tok->lineno; 120 | const char *line_start = type == STRING ? 
tok->multi_line_start : tok->line_start; 121 | #else 122 | int lineno = tok->lineno; 123 | const char *line_start = tok->line_start; 124 | #endif 125 | int end_lineno = tok->lineno; 126 | int col_offset = -1, end_col_offset = -1; 127 | if (start != NULL && start >= line_start) 128 | col_offset = start - line_start; 129 | if (end != NULL && end >= tok->line_start) 130 | end_col_offset = end - tok->line_start; 131 | 132 | return Py_BuildValue("(iO(ii)(ii))", type, value, lineno, col_offset, end_lineno, end_col_offset); 133 | } 134 | 135 | static PyObject * 136 | CTok_iter(PyObject *self) 137 | { 138 | Py_INCREF(self); 139 | return self; 140 | } 141 | 142 | static PyObject * 143 | CTok_iternext(PyObject *self) 144 | { 145 | return CTok_get((CTokObject *)self, NULL); 146 | } 147 | 148 | static PyMethodDef CTok_methods[] = { 149 | {"get", (PyCFunction) CTok_get, METH_NOARGS, 150 | "Get the next token\n" 151 | "\n" 152 | "Returns (type, string, (line, col), (endline, endcol))." 153 | }, 154 | {"get_raw", (PyCFunction) CTok_get_raw, METH_NOARGS, 155 | "Get the next token without allocating much\n" 156 | "\n" 157 | "Returns (type, start, end) where start and end point into self.input()." 
158 |     },
159 |     {NULL} /* Sentinel */
160 | };
161 | 
162 | static PyObject *
163 | CTok_input(CTokObject *self, void *closure) /* .input getter: newline-translated copy of the source as bytes, or None */
164 | {
165 |     if (self->tok == NULL) {
166 |         PyErr_SetString(PyExc_ValueError, "Uninitialized tokenizer"); /* fixed typo: was "Uninitalized" */
167 |         return NULL;
168 |     }
169 | 
170 |     if (self->tok->input == NULL) {
171 |         Py_RETURN_NONE;
172 |     }
173 | 
174 |     return PyBytes_FromString(self->tok->input);
175 | }
176 | 
177 | static PyObject *
178 | CTok_encoding(CTokObject *self, void *closure) /* .encoding getter: source encoding the tokenizer detected, or None */
179 | {
180 |     if (self->tok == NULL) {
181 |         PyErr_SetString(PyExc_ValueError, "Uninitialized tokenizer"); /* fixed typo: was "Uninitalized" */
182 |         return NULL;
183 |     }
184 | 
185 |     if (self->tok->encoding == NULL) {
186 |         Py_RETURN_NONE;
187 |     }
188 | 
189 |     return PyUnicode_FromString(self->tok->encoding);
190 | }
191 | 
192 | static PyGetSetDef CTok_getsetters[] = {
193 |     {"input", (getter) CTok_input, (setter) NULL,
194 |      "Input string encoded by the tokenizer (bytes)", NULL},
195 |     {"encoding", (getter) CTok_encoding, (setter) NULL,
196 |      "Encoding discovered by the tokenizer", NULL},
197 |     {NULL} /* Sentinel */
198 | };
199 | 
200 | static PyTypeObject CTokType = {
201 |     PyVarObject_HEAD_INIT(NULL, 0)
202 |     .tp_name = "ctok.CTok",
203 |     .tp_basicsize = sizeof(CTokObject),
204 |     .tp_itemsize = 0,
205 |     .tp_dealloc = (destructor) CTok_dealloc,
206 |     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
207 |     .tp_doc = "C Tokenizer",
208 |     .tp_iter = CTok_iter,
209 |     .tp_iternext = CTok_iternext,
210 |     .tp_methods = CTok_methods,
211 |     .tp_getset = CTok_getsetters,
212 |     .tp_init = (initproc) CTok_init,
213 |     .tp_new = CTok_new,
214 | };
215 | 
216 | static struct PyModuleDef ctokmodule = {
217 |     PyModuleDef_HEAD_INIT,
218 |     .m_name = "ctok",
219 |     .m_doc = "Expose CPython's tokenizer as a Python class",
220 | };
221 | 
222 | PyMODINIT_FUNC
223 | PyInit_ctok(void)
224 | {
225 |     if (PyType_Ready(&CTokType) < 0)
226 |         return NULL;
227 | 
228 |     PyObject *m = PyModule_Create(&ctokmodule);
229 |     if (m == NULL)
230 |         return NULL;
231 | 
232 | Py_INCREF(&CTokType); 233 | PyModule_AddObject(m, "CTok", (PyObject *) &CTokType); 234 | 235 | return m; 236 | } 237 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # Reproduce the code in README.md. 2 | from token import * 3 | 4 | import ctok 5 | 6 | tok = ctok.CTok(b"(hello+world)") 7 | for token in tok: 8 | print(token) 9 | 10 | print("Raw:") 11 | tok = ctok.CTok(b"(hello+world)") 12 | while True: 13 | token = tok.get_raw() 14 | print(token) 15 | if token[0] in (ENDMARKER, ERRORTOKEN): 16 | break 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | 3 | mods = [Extension('ctok', sources = ['ctok.c'])] 4 | 5 | setup( 6 | name='ctok', 7 | version='0.0', 8 | description="Expose CPython's tokenizer as a Python class", 9 | ext_modules=mods, 10 | setup_requires=["pytest-runner"], 11 | tests_require=["pytest"], 12 | ) 13 | -------------------------------------------------------------------------------- /test_ctok.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from token import * 3 | 4 | import pytest 5 | 6 | import ctok 7 | 8 | def test_basic(): 9 | input = b"(hello+world)" 10 | tokens = list(ctok.CTok(input)) 11 | assert tokens == [ 12 | (LPAR, b'(', (1, 0), (1, 1)), 13 | (NAME, b'hello', (1, 1), (1, 6)), 14 | (PLUS, b'+', (1, 6), (1, 7)), 15 | (NAME, b'world', (1, 7), (1, 12)), 16 | (RPAR, b')', (1, 12), (1, 13)), 17 | ] 18 | 19 | def test_indent(): 20 | input = b"if 1:\n pass\npass" 21 | tokens = 
list(ctok.CTok(input)) 22 | assert tokens == [ 23 | (NAME, b'if', (1, 0), (1, 2)), 24 | (NUMBER, b'1', (1, 3), (1, 4)), 25 | (COLON, b':', (1, 4), (1, 5)), 26 | (NEWLINE, b'', (1, 5), (1, 5)), 27 | (INDENT, None, (2, -1), (2, -1)), 28 | (NAME, b'pass', (2, 2), (2, 6)), 29 | (NEWLINE, b'', (2, 6), (2, 6)), 30 | (DEDENT, None, (3, -1), (3, -1)), 31 | (NAME, b'pass', (3, 0), (3, 4)), 32 | ] 33 | 34 | def test_no_indent(): 35 | input = b"(foo\n bar)" 36 | tokens = list(ctok.CTok(input)) 37 | assert tokens == [ 38 | (LPAR, b'(', (1, 0), (1, 1)), 39 | (NAME, b'foo', (1, 1), (1, 4)), 40 | # No NEWLINE, INDENT here! 41 | (NAME, b'bar', (2, 2), (2, 5)), 42 | (RPAR, b')', (2, 5), (2, 6)), 43 | ] 44 | 45 | def test_multi_line_string(): 46 | input = b"'''foo\nbar'''" 47 | tokens = list(ctok.CTok(input)) 48 | if sys.version_info >= (3, 8): 49 | start = (1, 0) 50 | else: 51 | # Older Python versions don't have the correct line number for 52 | # the start of a multi-line string. 53 | start = (2, -1) 54 | assert tokens == [ 55 | (STRING, b"'''foo\nbar'''", start, (2, 6)), 56 | ] 57 | 58 | def test_input_cr(): 59 | input = b"foo\rbar" 60 | tok = ctok.CTok(input) 61 | assert tok.input == b"foo\nbar" 62 | 63 | def test_input_crlf(): 64 | input = b"foo\r\nbar" 65 | tok = ctok.CTok(input) 66 | assert tok.input == b"foo\nbar" 67 | 68 | def test_encoding(): 69 | input = b"# coding: latin-1\nfoo\nbar" 70 | tok = ctok.CTok(input) 71 | assert tok.encoding == "iso-8859-1" 72 | 73 | def test_encoding_default(): 74 | input = b"foo\nbar" 75 | tok = ctok.CTok(input) 76 | assert tok.encoding is None 77 | 78 | def test_get_raw(): 79 | input = b"foo bar\r\nbaz" 80 | tok = ctok.CTok(input) 81 | assert tok.get_raw() == (NAME, 0, 3) 82 | assert tok.get_raw() == (NAME, 4, 7) 83 | assert tok.get_raw() == (NEWLINE, 7, 7) 84 | assert tok.get_raw() == (NAME, 8, 11) 85 | assert tok.get_raw() == (ENDMARKER, -1, -1) 86 | 87 | def test_endmarker(): 88 | input = b"foo\nbar\n" 89 | tok = ctok.CTok(input) 90 | 
tok.get() 91 | tok.get() 92 | assert tok.get() == (NAME, b"bar", (2, 0), (2, 3)) 93 | assert tok.get() == (NEWLINE, b"", (2, 3), (2, 3)) 94 | with pytest.raises(StopIteration) as excinfo: 95 | tok.get() 96 | assert "end of input at line 2" in str(excinfo.value) 97 | 98 | def test_error(): 99 | input = b"foo\n'bar" 100 | tok = ctok.CTok(input) 101 | assert tok.get() == (NAME, b"foo", (1, 0), (1, 3)) 102 | assert tok.get()[0] == NEWLINE 103 | with pytest.raises(SyntaxError) as excinfo: 104 | tok.get() 105 | assert "error at line 2" in str(excinfo.value) 106 | -------------------------------------------------------------------------------- /v35tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | 15 | enum decoding_state { 16 | STATE_INIT, 17 | STATE_RAW, 18 | STATE_NORMAL /* have a codec associated with input */ 19 | }; 20 | 21 | /* Tokenizer state */ 22 | struct tok_state { 23 | /* Input state; buf <= cur <= inp <= end */ 24 | /* NB an entire line is held in the buffer */ 25 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 26 | char *cur; /* Next character in buffer */ 27 | char *inp; /* End of data in buffer */ 28 | char *end; /* End of input buffer if buf != NULL */ 29 | char *start; /* Start of current token if not NULL */ 30 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 31 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 32 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 33 | int tabsize; /* Tab spacing */ 34 | int indent; /* Current indentation index */ 35 | int indstack[MAXINDENT]; /* Stack of indents */ 36 | int atbol; /* Nonzero if at begin of new line */ 37 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 38 | const char *prompt, *nextprompt; /* For interactive prompting */ 39 | int lineno; /* Current line number */ 40 | int level; /* () [] {} Parentheses nesting level */ 41 | /* Used to allow free continuations inside them */ 42 | /* Stuff for checking on different tab sizes */ 43 | #ifndef PGEN 44 | /* pgen doesn't have access to Python codecs, it cannot decode the input 45 | filename. The bytes filename might be kept, but it is only used by 46 | indenterror() and it is not really needed: pgen only compiles one file 47 | (Grammar/Grammar). */ 48 | PyObject *filename; 49 | #endif 50 | int altwarning; /* Issue warning if alternate tabs don't match */ 51 | int alterror; /* Issue error if alternate tabs don't match */ 52 | int alttabsize; /* Alternate tab spacing */ 53 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 54 | /* Stuff for PEP 0263 */ 55 | enum decoding_state decoding_state; 56 | int decoding_erred; /* whether erred in decoding */ 57 | int read_coding_spec; /* whether 'coding:...' has been read */ 58 | char *encoding; /* Source encoding. */ 59 | int cont_line; /* whether we are in a continuation line. */ 60 | const char* line_start; /* pointer to start of current line */ 61 | #ifndef PGEN 62 | PyObject *decoding_readline; /* open(...).readline */ 63 | PyObject *decoding_buffer; 64 | #endif 65 | const char* enc; /* Encoding for the current str. */ 66 | const char* str; 67 | const char* input; /* Tokenizer's newline translated copy of the string. */ 68 | 69 | /* async/await related fields; can be removed in 3.7 when async and await 70 | become normal keywords. 
*/ 71 | int async_def; /* =1 if tokens are inside an 'async def' body. */ 72 | int async_def_indent; /* Indentation level of the outermost 'async def'. */ 73 | int async_def_nl; /* =1 if the outermost 'async def' had at least one 74 | NEWLINE token after it. */ 75 | }; 76 | 77 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 78 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 79 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 80 | const char *, const char *); 81 | extern void PyTokenizer_Free(struct tok_state *); 82 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 83 | extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 84 | int len, int *offset); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | #endif /* !Py_TOKENIZER_H */ 90 | -------------------------------------------------------------------------------- /v36tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | 15 | enum decoding_state { 16 | STATE_INIT, 17 | STATE_RAW, 18 | STATE_NORMAL /* have a codec associated with input */ 19 | }; 20 | 21 | /* Tokenizer state */ 22 | struct tok_state { 23 | /* Input state; buf <= cur <= inp <= end */ 24 | /* NB an entire line is held in the buffer */ 25 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 26 | char *cur; /* Next character in buffer */ 27 | char *inp; /* End of data in buffer */ 28 | char *end; /* End of input buffer if buf != NULL */ 29 | char *start; /* Start of current token if not NULL */ 30 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 31 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 32 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 33 | int tabsize; /* Tab spacing */ 34 | int indent; /* Current indentation index */ 35 | int indstack[MAXINDENT]; /* Stack of indents */ 36 | int atbol; /* Nonzero if at begin of new line */ 37 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 38 | const char *prompt, *nextprompt; /* For interactive prompting */ 39 | int lineno; /* Current line number */ 40 | int level; /* () [] {} Parentheses nesting level */ 41 | /* Used to allow free continuations inside them */ 42 | /* Stuff for checking on different tab sizes */ 43 | #ifndef PGEN 44 | /* pgen doesn't have access to Python codecs, it cannot decode the input 45 | filename. The bytes filename might be kept, but it is only used by 46 | indenterror() and it is not really needed: pgen only compiles one file 47 | (Grammar/Grammar). */ 48 | PyObject *filename; 49 | #endif 50 | int altwarning; /* Issue warning if alternate tabs don't match */ 51 | int alterror; /* Issue error if alternate tabs don't match */ 52 | int alttabsize; /* Alternate tab spacing */ 53 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 54 | /* Stuff for PEP 0263 */ 55 | enum decoding_state decoding_state; 56 | int decoding_erred; /* whether erred in decoding */ 57 | int read_coding_spec; /* whether 'coding:...' has been read */ 58 | char *encoding; /* Source encoding. */ 59 | int cont_line; /* whether we are in a continuation line. */ 60 | const char* line_start; /* pointer to start of current line */ 61 | #ifndef PGEN 62 | PyObject *decoding_readline; /* open(...).readline */ 63 | PyObject *decoding_buffer; 64 | #endif 65 | const char* enc; /* Encoding for the current str. */ 66 | const char* str; 67 | const char* input; /* Tokenizer's newline translated copy of the string. */ 68 | 69 | /* async/await related fields; can be removed in 3.7 when async and await 70 | become normal keywords. 
*/ 71 | int async_def; /* =1 if tokens are inside an 'async def' body. */ 72 | int async_def_indent; /* Indentation level of the outermost 'async def'. */ 73 | int async_def_nl; /* =1 if the outermost 'async def' had at least one 74 | NEWLINE token after it. */ 75 | }; 76 | 77 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 78 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 79 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 80 | const char *, const char *); 81 | extern void PyTokenizer_Free(struct tok_state *); 82 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 83 | extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 84 | int len, int *offset); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | #endif /* !Py_TOKENIZER_H */ 90 | -------------------------------------------------------------------------------- /v37tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | 15 | enum decoding_state { 16 | STATE_INIT, 17 | STATE_RAW, 18 | STATE_NORMAL /* have a codec associated with input */ 19 | }; 20 | 21 | /* Tokenizer state */ 22 | struct tok_state { 23 | /* Input state; buf <= cur <= inp <= end */ 24 | /* NB an entire line is held in the buffer */ 25 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 26 | char *cur; /* Next character in buffer */ 27 | char *inp; /* End of data in buffer */ 28 | char *end; /* End of input buffer if buf != NULL */ 29 | char *start; /* Start of current token if not NULL */ 30 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 31 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 32 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 33 | int tabsize; /* Tab spacing */ 34 | int indent; /* Current indentation index */ 35 | int indstack[MAXINDENT]; /* Stack of indents */ 36 | int atbol; /* Nonzero if at begin of new line */ 37 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 38 | const char *prompt, *nextprompt; /* For interactive prompting */ 39 | int lineno; /* Current line number */ 40 | int level; /* () [] {} Parentheses nesting level */ 41 | /* Used to allow free continuations inside them */ 42 | /* Stuff for checking on different tab sizes */ 43 | #ifndef PGEN 44 | /* pgen doesn't have access to Python codecs, it cannot decode the input 45 | filename. The bytes filename might be kept, but it is only used by 46 | indenterror() and it is not really needed: pgen only compiles one file 47 | (Grammar/Grammar). */ 48 | PyObject *filename; 49 | #endif 50 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 51 | /* Stuff for PEP 0263 */ 52 | enum decoding_state decoding_state; 53 | int decoding_erred; /* whether erred in decoding */ 54 | int read_coding_spec; /* whether 'coding:...' has been read */ 55 | char *encoding; /* Source encoding. */ 56 | int cont_line; /* whether we are in a continuation line. */ 57 | const char* line_start; /* pointer to start of current line */ 58 | #ifndef PGEN 59 | PyObject *decoding_readline; /* open(...).readline */ 60 | PyObject *decoding_buffer; 61 | #endif 62 | const char* enc; /* Encoding for the current str. */ 63 | const char* str; 64 | const char* input; /* Tokenizer's newline translated copy of the string. 
*/ 65 | }; 66 | 67 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 68 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 69 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 70 | const char *, const char *); 71 | extern void PyTokenizer_Free(struct tok_state *); 72 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 73 | 74 | #ifdef __cplusplus 75 | } 76 | #endif 77 | #endif /* !Py_TOKENIZER_H */ 78 | -------------------------------------------------------------------------------- /v38tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Py_TOKENIZER_H 2 | #define Py_TOKENIZER_H 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include "object.h" 8 | 9 | /* Tokenizer interface */ 10 | 11 | #include "token.h" /* For token types */ 12 | 13 | #define MAXINDENT 100 /* Max indentation level */ 14 | #define MAXLEVEL 200 /* Max parentheses level */ 15 | 16 | enum decoding_state { 17 | STATE_INIT, 18 | STATE_RAW, 19 | STATE_NORMAL /* have a codec associated with input */ 20 | }; 21 | 22 | /* Tokenizer state */ 23 | struct tok_state { 24 | /* Input state; buf <= cur <= inp <= end */ 25 | /* NB an entire line is held in the buffer */ 26 | char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ 27 | char *cur; /* Next character in buffer */ 28 | char *inp; /* End of data in buffer */ 29 | char *end; /* End of input buffer if buf != NULL */ 30 | char *start; /* Start of current token if not NULL */ 31 | int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ 32 | /* NB If done != E_OK, cur must be == inp!!! 
*/ 33 | FILE *fp; /* Rest of input; NULL if tokenizing a string */ 34 | int tabsize; /* Tab spacing */ 35 | int indent; /* Current indentation index */ 36 | int indstack[MAXINDENT]; /* Stack of indents */ 37 | int atbol; /* Nonzero if at begin of new line */ 38 | int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ 39 | const char *prompt, *nextprompt; /* For interactive prompting */ 40 | int lineno; /* Current line number */ 41 | int first_lineno; /* First line of a single line or multi line string 42 | expression (cf. issue 16806) */ 43 | int level; /* () [] {} Parentheses nesting level */ 44 | /* Used to allow free continuations inside them */ 45 | char parenstack[MAXLEVEL]; 46 | int parenlinenostack[MAXLEVEL]; 47 | PyObject *filename; 48 | /* Stuff for checking on different tab sizes */ 49 | int altindstack[MAXINDENT]; /* Stack of alternate indents */ 50 | /* Stuff for PEP 0263 */ 51 | enum decoding_state decoding_state; 52 | int decoding_erred; /* whether erred in decoding */ 53 | int read_coding_spec; /* whether 'coding:...' has been read */ 54 | char *encoding; /* Source encoding. */ 55 | int cont_line; /* whether we are in a continuation line. */ 56 | const char* line_start; /* pointer to start of current line */ 57 | const char* multi_line_start; /* pointer to start of first line of 58 | a single line or multi line string 59 | expression (cf. issue 16806) */ 60 | PyObject *decoding_readline; /* open(...).readline */ 61 | PyObject *decoding_buffer; 62 | const char* enc; /* Encoding for the current str. */ 63 | const char* str; 64 | const char* input; /* Tokenizer's newline translated copy of the string. */ 65 | 66 | int type_comments; /* Whether to look for type comments */ 67 | 68 | /* async/await related fields (still needed depending on feature_version) */ 69 | int async_hacks; /* =1 if async/await aren't always keywords */ 70 | int async_def; /* =1 if tokens are inside an 'async def' body. 
*/ 71 | int async_def_indent; /* Indentation level of the outermost 'async def'. */ 72 | int async_def_nl; /* =1 if the outermost 'async def' had at least one 73 | NEWLINE token after it. */ 74 | }; 75 | 76 | extern struct tok_state *PyTokenizer_FromString(const char *, int); 77 | extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); 78 | extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, 79 | const char *, const char *); 80 | extern void PyTokenizer_Free(struct tok_state *); 81 | extern int PyTokenizer_Get(struct tok_state *, char **, char **); 82 | 83 | #define tok_dump _Py_tok_dump 84 | 85 | #ifdef __cplusplus 86 | } 87 | #endif 88 | #endif /* !Py_TOKENIZER_H */ 89 | --------------------------------------------------------------------------------