├── src
├── main.cpp
├── scanner.hpp
└── scanner.cpp
├── program.txt
└── README.md
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include "scanner.hpp"
2 |
3 | int main()
4 | {
5 | lexicalAnalyze("../program.txt");
6 | return 0;
7 | }
8 |
--------------------------------------------------------------------------------
/program.txt:
--------------------------------------------------------------------------------
1 | // This is the c test file
2 |
3 | int x ;
4 | x = x + 5 ;
5 | /*
6 | Multi line Comment
7 | */
8 |
9 | // Single line Comment
10 | for(int i = 0, i <= 4; i++){
11 | x += i;
12 | }
13 | x++;
14 |
15 | 54dsa
16 | _sda = "gdsdg";
17 |
--------------------------------------------------------------------------------
/src/scanner.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file scanner.hpp
3 | * @author Amirhossein Hakimnejad
4 | *
5 | * @date 2018 Nov
6 | */
7 |
8 |
9 | #pragma once
10 |
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
16 |
17 |
18 | using std::cout;
19 | using std::vector;
20 |
/// Returns true if @p str is an identifier.
bool isID(const std::string &str);
/// Returns true if @p str is a comment opener.
bool isComment(const std::string &str);
/// Returns true if @p str is made of decimal digits.
bool isDigit(const std::string &str);
/// Returns true if @p str is a double-quoted string literal.
bool isString(const std::string &str);
/// Returns true if @p str is a boolean literal.
bool isBool(const std::string &str);
/// Returns true if @p str is a numeric, string or boolean literal.
bool isLiteral(const std::string &str);
/// Returns true if @p str is one of the recognized keywords.
bool isKeyword(const std::string &str);
/// Returns true if @p str is one of the recognized statement words.
bool isStatement(const std::string &str);
/// Returns true if @p str is one of the recognized operators.
bool isOperator(const std::string &str);
/// Returns true if @p str is one of the recognized separators.
bool isSeparator(const std::string &str);
/// Returns true if @p str is text that only separates tokens (whitespace).
bool isNotLegal(const std::string &str);
/// Prints the "(role, token)" pair for a single token; throws
/// std::runtime_error if the token cannot be classified.
void printRoleOfToken(const std::string &token);
/// Tokenizes the file @p nameOfFile and prints the role of every token.
void lexicalAnalyze(const std::string &nameOfFile);
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scanner-for-lexical-analyzer-in-cpp
2 | A simple C++ program that takes a file as input, tokenizes it, and reports the name of each token (identifier, keyword, separator, operator, literal or comment).
3 |
4 |
5 | To see what a compiler's lexical analyzer is click [here](https://en.wikipedia.org/wiki/Lexical_analysis)
6 | ### Prerequisites
7 |
8 | You need a compiler that supports C++11 or higher to build the code.
9 | To install g++ on Debian/Ubuntu, run:
10 | ```
11 | sudo apt-get install g++
12 | ```
13 |
14 | ### Using
15 |
16 | To use it, first you just need to clone it:
17 |
18 | ```
19 | git clone git@github.com:amirhakimnejad/Scanner-for-lexical-analyzer-in-cpp.git
20 | ```
21 |
22 | Change the test file (program.txt) as you desire:
23 | ```
24 | // This is the c test file
25 |
26 | int x ;
27 | x = x + 5 ;
28 | /*
29 | Multi line Comment
30 | */
31 |
32 | // Single line Comment
33 | for(int i = 0, i <= 4; i++){
34 | x += i;
35 | }
36 | x++;
37 |
38 | 54dsa
39 | _sda = "gdsdg";
40 |
41 | ```
42 | Compile with c++11:
43 |
44 | ```
45 | g++ main.cpp scanner.cpp -std=c++11
46 | ```
47 |
48 | Run it:
49 | ```
50 | ./a.out
51 | ```
52 |
53 | This will be your output:
54 | ```
55 | [(comment, //),
56 | (keyword, int),
57 | (identifier, x),
58 | (separator, ;),
59 | (identifier, x),
60 | (operator, =),
61 | (identifier, x),
62 | (operator, +),
63 | (literal, 5),
64 | (separator, ;),
65 | (comment, /*),
66 | (comment, //),
67 | (statement, for),
68 | (separator, (),
69 | (keyword, int),
70 | (identifier, i),
71 | (operator, =),
72 | (literal, 0),
73 | (separator, ,),
74 | (identifier, i),
75 | unknown
76 | ,
77 | (operator, =),
78 | (literal, 4),
79 | (separator, ;),
80 | (identifier, i),
81 | (operator, ++),
82 | (separator, )),
83 | (separator, {),
84 | (identifier, x),
85 | (operator, +=),
86 | (identifier, i),
87 | (separator, ;),
88 | (separator, }),
89 | (identifier, x),
90 | (operator, ++),
91 | (separator, ;),
92 | unknown
93 | ,
94 | (identifier, _sda),
95 | (operator, =),
96 | (literal, "gdsdg"),
97 | (separator, ;),
98 | ]
99 | ```
100 |
101 | As you can see, the program recognizes only a few keywords and statements. You can manually add more values to the corresponding vectors:
102 | ```c++
103 | const vector<std::string> keywords{"int", "float", "auto", "double", "do", "switch", "return"};
104 | const vector<std::string> statements{"for", "while"};
105 | const vector<std::string> operators{"*", "+", "-", "/", "=", "-=", "*=", "+=", "/=", "++", "--", "=="};
106 | const vector<std::string> Separators{"{", "}", ",", "(", ")", ";"};
107 | ```
108 |
109 |
110 |
111 | ### How each function works
112 | ```c++
113 | bool isID(const std::string &str);
114 | bool isComment(const std::string &str);
115 | bool isDigit(const std::string &str);
116 | bool isString(const std::string &str);
117 | bool isBool(const std::string &str);
118 | bool isLiteral(const std::string &str);
119 | bool isKeyword(const std::string &str);
120 | bool isStatement(const std::string &str);
121 | bool isOperator(const std::string &str);
122 | bool isSeparator(const std::string &str);
123 | bool isNotLegal(const std::string &str);
124 | void printRoleOfToken(const vector<std::string>& tokens);
125 | void lexicalAnalyze(const std::string &nameOfFile);
126 | ```
127 | Most of the functions above don't need any explanation. In general, the lexicalAnalyze() function tokenizes the given file into
128 | a vector of strings, ignoring whitespace, newlines and the contents of comments. It does so with the help of isOperator(),
129 | isNotLegal() and isComment(), because these are the only things that come between the main parts of a program, as in:
130 | ```
131 | for(i=5; i<= 10; i++){}
132 | ```
133 | After building the tokens vector, lexicalAnalyze() passes it to the printRoleOfToken() function, which simply uses all the
134 | other functions to print the name of each token.
135 | [Known tokens](https://en.wikipedia.org/wiki/Lexical_analysis#Token):
136 | ```
137 | identifier: names the programmer chooses;
138 | keyword: names already in the programming language;
139 | separator (also known as punctuators): punctuation characters and paired-delimiters;
140 | operator: symbols that operate on arguments and produce results;
141 | literal: numeric, logical, textual, reference literals;
142 | comment: line, block.
143 | ```
144 |
145 | Feel free to ask questions, find bugs(:D) or anything else.
146 |
147 | ## Authors
148 |
149 | * **Amirhossein Hakimnejad** - *Initial work* - [amirhakimnejad](https://github.com/amirhakimnejad)
150 | * **Henry** - *Contributor* - [henry-bugfree](https://github.com/henry-bugfree)
151 |
152 | ## License
153 |
154 | This project is licensed under the MIT License
155 |
--------------------------------------------------------------------------------
/src/scanner.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file scanner.cpp
3 | * @author Amirhossein Hakimnejad
4 | *
5 | * @date 2018 Nov
6 | */
7 |
8 |
9 | #include "scanner.hpp"
10 |
11 | using std::cout;
12 | using std::vector;
13 |
/**
 * @brief Checks whether a token is a valid identifier.
 *
 * An identifier is non-empty, does not start with a digit, and consists only
 * of alphanumeric characters (per std::isalnum) and underscores.
 *
 * @param str token text to test
 * @return true if @p str is an identifier, false otherwise
 */
bool isID(const std::string &str)
{
    // An empty token, or one starting with a digit (e.g. "54dsa"), is never an identifier.
    if (str.empty() || std::isdigit(static_cast<unsigned char>(str[0])))
        return false;

    // Cast to unsigned char: handing a negative char to a <cctype> function is undefined behaviour.
    return std::all_of(str.begin(), str.end(), [](char c) {
        return c == '_' || std::isalnum(static_cast<unsigned char>(c)) != 0;
    });
}
28 |
/**
 * @brief Checks whether a token is a comment opener.
 *
 * @param str token text to test
 * @return true only for the two-character tokens "/*" and "//"
 */
bool isComment(const std::string &str)
{
    // Both openers are exactly two characters long and begin with a slash.
    if (str.size() != 2 || str.front() != '/')
        return false;

    const char second = str.back();
    return second == '*' || second == '/';
}
33 |
/**
 * @brief Checks whether a token is an unsigned decimal integer.
 *
 * @param str token text to test
 * @return true if @p str is non-empty and every character is a decimal digit
 */
bool isDigit(const std::string &str)
{
    // std::all_of is vacuously true on an empty range; an empty token is not a number.
    if (str.empty())
        return false;

    // Cast to unsigned char: handing a negative char to std::isdigit is undefined behaviour.
    return std::all_of(str.begin(), str.end(), [](char c) {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    });
}
38 |
/**
 * @brief Checks whether a token is a double-quoted string literal.
 *
 * @param str token text to test
 * @return true if @p str starts and ends with distinct '"' characters
 */
bool isString(const std::string &str)
{
    // At least two characters are required: the size check keeps the empty
    // string from being indexed and stops a lone '"' from serving as both the
    // opening and the closing quote.
    return str.size() >= 2 && str.front() == '"' && str.back() == '"';
}
43 |
/**
 * @brief Checks whether a token is a boolean literal.
 *
 * @param str token text to test
 * @return true for exactly "true" or "false"
 */
bool isBool(const std::string &str)
{
    if (str.compare("true") == 0)
        return true;

    return str.compare("false") == 0;
}
48 |
49 | bool isLiteral(const std::string &str)
50 | {
51 | return isDigit(str) || isString(str) || isBool(str);
52 | }
53 |
/**
 * @brief Checks whether a token is one of the recognized keywords.
 *
 * @param str token text to test
 * @return true if @p str exactly matches an entry of the keyword table
 */
bool isKeyword(const std::string &str)
{
    // Built once on first use instead of on every call. Extend this list to
    // recognize more keywords.
    static const std::vector<std::string> keywords{
        "int", "float", "auto", "double", "do", "switch", "return"};

    return std::find(keywords.begin(), keywords.end(), str) != keywords.end();
}
63 |
/**
 * @brief Checks whether a token is one of the recognized statement words.
 *
 * @param str token text to test
 * @return true if @p str exactly matches an entry of the statement table
 */
bool isStatement(const std::string &str)
{
    // Built once on first use instead of on every call. Extend this list to
    // recognize more statements.
    static const std::vector<std::string> statements{"for", "while"};

    return std::find(statements.begin(), statements.end(), str) != statements.end();
}
73 |
/**
 * @brief Checks whether a token is one of the recognized operators.
 *
 * @param str token text to test
 * @return true if @p str exactly matches an entry of the operator table
 */
bool isOperator(const std::string &str)
{
    // Built once on first use: the tokenizer calls this several times per
    // input character, so rebuilding the table on each call is wasteful.
    static const std::vector<std::string> operators{
        "<", ">", "<=", ">=", "*", "+", "-", "/", "=",
        "-=", "*=", "+=", "/=", "++", "--", "=="};

    return std::find(operators.begin(), operators.end(), str) != operators.end();
}
83 |
/**
 * @brief Checks whether a token is one of the recognized separators.
 *
 * @param str token text to test
 * @return true if @p str exactly matches an entry of the separator table
 */
bool isSeparator(const std::string &str)
{
    // Built once on first use instead of on every call.
    static const std::vector<std::string> separators{"{", "}", ",", "(", ")", ";"};

    return std::find(separators.begin(), separators.end(), str) != separators.end();
}
93 |
/**
 * @brief Checks whether a one-character string is whitespace.
 *
 * Whitespace never belongs to a token; it only separates tokens. Besides the
 * space and newline, tabs, carriage returns and the other characters accepted
 * by std::isspace are recognized, so tab-indented or CRLF files tokenize
 * correctly.
 *
 * @param str text to test; only single-character strings can match
 * @return true if @p str is exactly one whitespace character
 */
bool isNotLegal(const std::string &str)
{
    // Cast to unsigned char: handing a negative char to std::isspace is undefined behaviour.
    return str.size() == 1 && std::isspace(static_cast<unsigned char>(str[0])) != 0;
}
98 |
99 | void printRoleOfToken(const std::string& token)
100 | {
101 | if(isOperator(token))
102 | cout << "(operator, " << token << ")";
103 | else if(isSeparator(token))
104 | cout << "(separator, " << token << ")";
105 | else if(isKeyword(token))
106 | cout << "(keyword, " << token << ")";
107 | else if(isStatement(token))
108 | cout << "(statement, " << token << ")";
109 | else if(isLiteral(token))
110 | cout << "(literal, " << token << ")";
111 | else if(isID(token))
112 | cout << "(identifier, " << token << ")";
113 | else if(isComment(token))
114 | cout << "(comment, " << token << ")";
115 | else
116 | throw std::runtime_error("Invalid token: " + token);
117 | }
118 |
119 | void lexicalAnalyze(const std::string &nameOfFile)
120 | {
121 | char ch;
122 | std::string buffer;
123 | std::fstream file(nameOfFile, std::fstream::in);
124 |
125 | if (!file.is_open())
126 | {
127 | cout << "error while opening the file\n";
128 | exit(0);
129 | }
130 |
131 | bool miltiCm = false, singleCm = false;
132 | while (file >> std::noskipws >> ch)
133 | {
134 | if(singleCm || miltiCm)
135 | {
136 | if(singleCm && ch == '\n')
137 | singleCm = false;
138 |
139 | if(miltiCm && ch == '*')
140 | {
141 | file >> ch;
142 | if(ch == EOF)
143 | break;
144 |
145 | if(ch == '/')
146 | miltiCm = false;
147 | }
148 | continue;
149 | }
150 |
151 | if(ch == '/')
152 | {
153 | std::string comm(1, ch);
154 | file >> ch;
155 | if(ch == EOF)
156 | {
157 | printRoleOfToken(comm);
158 | break;
159 | }
160 |
161 | if(ch == '*')
162 | {
163 | miltiCm = true;
164 | comm += ch;
165 | }
166 | else if(ch == '/')
167 | {
168 | singleCm = true;
169 | comm += ch;
170 | }
171 | if(miltiCm || singleCm)
172 | {
173 | printRoleOfToken(comm);
174 | continue;
175 | }
176 | }
177 |
178 | if(isNotLegal(std::string(1, ch)))
179 | {
180 | if(!buffer.empty())
181 | {
182 | printRoleOfToken(buffer);
183 | buffer = "";
184 | }
185 | continue;
186 | }
187 |
188 | if(isOperator(std::string(1, ch)) && !isOperator(buffer))
189 | {
190 | if(!buffer.empty())
191 | {
192 | printRoleOfToken(buffer);
193 | buffer = "";
194 | }
195 | }
196 |
197 | if(!isOperator(std::string(1, ch)) && isOperator(buffer))
198 | {
199 | printRoleOfToken(buffer);
200 | buffer = "";
201 | }
202 |
203 | if(isSeparator(std::string(1, ch)))
204 | {
205 | if(!buffer.empty())
206 | {
207 | printRoleOfToken(buffer);
208 | buffer = "";
209 | }
210 | if(isSeparator(std::string(1, ch)))
211 | {
212 | printRoleOfToken(std::string(1, ch));
213 | continue;
214 | }
215 | }
216 | buffer += ch;
217 | }
218 | file.close();
219 | }
220 |
--------------------------------------------------------------------------------